{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f5c9f22c",
   "metadata": {},
   "source": [
    "# MLPerf BERT-Large\n",
    "## Implement INT8 BERT-Large in PyTorch\n",
    "\n",
    "The model checkpoint was downloaded from [Zenodo record 4792496](https://zenodo.org/record/4792496)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "c9d8bdea",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import QDQBertForQuestionAnswering, QDQBertModel, AutoConfig\n",
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d55b6f51",
   "metadata": {},
   "outputs": [],
   "source": [
    "# QDQBert classes need both PyTorch and the pytorch-quantization package:\n",
    "#   pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com\n",
    "# Without them, instantiating the model raises ImportError.\n",
    "config = AutoConfig.from_pretrained('bert-large-uncased')\n",
    "\n",
    "# The constructor requires a config object. This call only builds the\n",
    "# architecture described by the config; it does not load checkpoint weights.\n",
    "model = QDQBertForQuestionAnswering(config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d40e25fd",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "4fcbc174",
   "metadata": {},
   "outputs": [
    {
     "ename": "RuntimeError",
     "evalue": "Error(s) in loading state_dict for BertModel:\n\tMissing key(s) in state_dict: \"embeddings.position_ids\", \"embeddings.word_embeddings.weight\", \"embeddings.position_embeddings.weight\", \"embeddings.token_type_embeddings.weight\", \"embeddings.LayerNorm.weight\", \"embeddings.LayerNorm.bias\", \"encoder.layer.0.attention.self.query.weight\", \"encoder.layer.0.attention.self.query.bias\", \"encoder.layer.0.attention.self.key.weight\", \"encoder.layer.0.attention.self.key.bias\", \"encoder.layer.0.attention.self.value.weight\", \"encoder.layer.0.attention.self.value.bias\", \"encoder.layer.0.attention.output.dense.weight\", \"encoder.layer.0.attention.output.dense.bias\", \"encoder.layer.0.attention.output.LayerNorm.weight\", \"encoder.layer.0.attention.output.LayerNorm.bias\", \"encoder.layer.0.intermediate.dense.weight\", \"encoder.layer.0.intermediate.dense.bias\", \"encoder.layer.0.output.dense.weight\", \"encoder.layer.0.output.dense.bias\", \"encoder.layer.0.output.LayerNorm.weight\", \"encoder.layer.0.output.LayerNorm.bias\", \"encoder.layer.1.attention.self.query.weight\", \"encoder.layer.1.attention.self.query.bias\", \"encoder.layer.1.attention.self.key.weight\", \"encoder.layer.1.attention.self.key.bias\", \"encoder.layer.1.attention.self.value.weight\", \"encoder.layer.1.attention.self.value.bias\", \"encoder.layer.1.attention.output.dense.weight\", \"encoder.layer.1.attention.output.dense.bias\", \"encoder.layer.1.attention.output.LayerNorm.weight\", \"encoder.layer.1.attention.output.LayerNorm.bias\", \"encoder.layer.1.intermediate.dense.weight\", \"encoder.layer.1.intermediate.dense.bias\", \"encoder.layer.1.output.dense.weight\", \"encoder.layer.1.output.dense.bias\", \"encoder.layer.1.output.LayerNorm.weight\", \"encoder.layer.1.output.LayerNorm.bias\", \"encoder.layer.2.attention.self.query.weight\", \"encoder.layer.2.attention.self.query.bias\", \"encoder.layer.2.attention.self.key.weight\", 
\"encoder.layer.2.attention.self.key.bias\", \"encoder.layer.2.attention.self.value.weight\", \"encoder.layer.2.attention.self.value.bias\", \"encoder.layer.2.attention.output.dense.weight\", \"encoder.layer.2.attention.output.dense.bias\", \"encoder.layer.2.attention.output.LayerNorm.weight\", \"encoder.layer.2.attention.output.LayerNorm.bias\", \"encoder.layer.2.intermediate.dense.weight\", \"encoder.layer.2.intermediate.dense.bias\", \"encoder.layer.2.output.dense.weight\", \"encoder.layer.2.output.dense.bias\", \"encoder.layer.2.output.LayerNorm.weight\", \"encoder.layer.2.output.LayerNorm.bias\", \"encoder.layer.3.attention.self.query.weight\", \"encoder.layer.3.attention.self.query.bias\", \"encoder.layer.3.attention.self.key.weight\", \"encoder.layer.3.attention.self.key.bias\", \"encoder.layer.3.attention.self.value.weight\", \"encoder.layer.3.attention.self.value.bias\", \"encoder.layer.3.attention.output.dense.weight\", \"encoder.layer.3.attention.output.dense.bias\", \"encoder.layer.3.attention.output.LayerNorm.weight\", \"encoder.layer.3.attention.output.LayerNorm.bias\", \"encoder.layer.3.intermediate.dense.weight\", \"encoder.layer.3.intermediate.dense.bias\", \"encoder.layer.3.output.dense.weight\", \"encoder.layer.3.output.dense.bias\", \"encoder.layer.3.output.LayerNorm.weight\", \"encoder.layer.3.output.LayerNorm.bias\", \"encoder.layer.4.attention.self.query.weight\", \"encoder.layer.4.attention.self.query.bias\", \"encoder.layer.4.attention.self.key.weight\", \"encoder.layer.4.attention.self.key.bias\", \"encoder.layer.4.attention.self.value.weight\", \"encoder.layer.4.attention.self.value.bias\", \"encoder.layer.4.attention.output.dense.weight\", \"encoder.layer.4.attention.output.dense.bias\", \"encoder.layer.4.attention.output.LayerNorm.weight\", \"encoder.layer.4.attention.output.LayerNorm.bias\", \"encoder.layer.4.intermediate.dense.weight\", \"encoder.layer.4.intermediate.dense.bias\", \"encoder.layer.4.output.dense.weight\", 
\"encoder.layer.4.output.dense.bias\", \"encoder.layer.4.output.LayerNorm.weight\", \"encoder.layer.4.output.LayerNorm.bias\", \"encoder.layer.5.attention.self.query.weight\", \"encoder.layer.5.attention.self.query.bias\", \"encoder.layer.5.attention.self.key.weight\", \"encoder.layer.5.attention.self.key.bias\", \"encoder.layer.5.attention.self.value.weight\", \"encoder.layer.5.attention.self.value.bias\", \"encoder.layer.5.attention.output.dense.weight\", \"encoder.layer.5.attention.output.dense.bias\", \"encoder.layer.5.attention.output.LayerNorm.weight\", \"encoder.layer.5.attention.output.LayerNorm.bias\", \"encoder.layer.5.intermediate.dense.weight\", \"encoder.layer.5.intermediate.dense.bias\", \"encoder.layer.5.output.dense.weight\", \"encoder.layer.5.output.dense.bias\", \"encoder.layer.5.output.LayerNorm.weight\", \"encoder.layer.5.output.LayerNorm.bias\", \"encoder.layer.6.attention.self.query.weight\", \"encoder.layer.6.attention.self.query.bias\", \"encoder.layer.6.attention.self.key.weight\", \"encoder.layer.6.attention.self.key.bias\", \"encoder.layer.6.attention.self.value.weight\", \"encoder.layer.6.attention.self.value.bias\", \"encoder.layer.6.attention.output.dense.weight\", \"encoder.layer.6.attention.output.dense.bias\", \"encoder.layer.6.attention.output.LayerNorm.weight\", \"encoder.layer.6.attention.output.LayerNorm.bias\", \"encoder.layer.6.intermediate.dense.weight\", \"encoder.layer.6.intermediate.dense.bias\", \"encoder.layer.6.output.dense.weight\", \"encoder.layer.6.output.dense.bias\", \"encoder.layer.6.output.LayerNorm.weight\", \"encoder.layer.6.output.LayerNorm.bias\", \"encoder.layer.7.attention.self.query.weight\", \"encoder.layer.7.attention.self.query.bias\", \"encoder.layer.7.attention.self.key.weight\", \"encoder.layer.7.attention.self.key.bias\", \"encoder.layer.7.attention.self.value.weight\", \"encoder.layer.7.attention.self.value.bias\", \"encoder.layer.7.attention.output.dense.weight\", 
\"encoder.layer.7.attention.output.dense.bias\", \"encoder.layer.7.attention.output.LayerNorm.weight\", \"encoder.layer.7.attention.output.LayerNorm.bias\", \"encoder.layer.7.intermediate.dense.weight\", \"encoder.layer.7.intermediate.dense.bias\", \"encoder.layer.7.output.dense.weight\", \"encoder.layer.7.output.dense.bias\", \"encoder.layer.7.output.LayerNorm.weight\", \"encoder.layer.7.output.LayerNorm.bias\", \"encoder.layer.8.attention.self.query.weight\", \"encoder.layer.8.attention.self.query.bias\", \"encoder.layer.8.attention.self.key.weight\", \"encoder.layer.8.attention.self.key.bias\", \"encoder.layer.8.attention.self.value.weight\", \"encoder.layer.8.attention.self.value.bias\", \"encoder.layer.8.attention.output.dense.weight\", \"encoder.layer.8.attention.output.dense.bias\", \"encoder.layer.8.attention.output.LayerNorm.weight\", \"encoder.layer.8.attention.output.LayerNorm.bias\", \"encoder.layer.8.intermediate.dense.weight\", \"encoder.layer.8.intermediate.dense.bias\", \"encoder.layer.8.output.dense.weight\", \"encoder.layer.8.output.dense.bias\", \"encoder.layer.8.output.LayerNorm.weight\", \"encoder.layer.8.output.LayerNorm.bias\", \"encoder.layer.9.attention.self.query.weight\", \"encoder.layer.9.attention.self.query.bias\", \"encoder.layer.9.attention.self.key.weight\", \"encoder.layer.9.attention.self.key.bias\", \"encoder.layer.9.attention.self.value.weight\", \"encoder.layer.9.attention.self.value.bias\", \"encoder.layer.9.attention.output.dense.weight\", \"encoder.layer.9.attention.output.dense.bias\", \"encoder.layer.9.attention.output.LayerNorm.weight\", \"encoder.layer.9.attention.output.LayerNorm.bias\", \"encoder.layer.9.intermediate.dense.weight\", \"encoder.layer.9.intermediate.dense.bias\", \"encoder.layer.9.output.dense.weight\", \"encoder.layer.9.output.dense.bias\", \"encoder.layer.9.output.LayerNorm.weight\", \"encoder.layer.9.output.LayerNorm.bias\", \"encoder.layer.10.attention.self.query.weight\", 
\"encoder.layer.10.attention.self.query.bias\", \"encoder.layer.10.attention.self.key.weight\", \"encoder.layer.10.attention.self.key.bias\", \"encoder.layer.10.attention.self.value.weight\", \"encoder.layer.10.attention.self.value.bias\", \"encoder.layer.10.attention.output.dense.weight\", \"encoder.layer.10.attention.output.dense.bias\", \"encoder.layer.10.attention.output.LayerNorm.weight\", \"encoder.layer.10.attention.output.LayerNorm.bias\", \"encoder.layer.10.intermediate.dense.weight\", \"encoder.layer.10.intermediate.dense.bias\", \"encoder.layer.10.output.dense.weight\", \"encoder.layer.10.output.dense.bias\", \"encoder.layer.10.output.LayerNorm.weight\", \"encoder.layer.10.output.LayerNorm.bias\", \"encoder.layer.11.attention.self.query.weight\", \"encoder.layer.11.attention.self.query.bias\", \"encoder.layer.11.attention.self.key.weight\", \"encoder.layer.11.attention.self.key.bias\", \"encoder.layer.11.attention.self.value.weight\", \"encoder.layer.11.attention.self.value.bias\", \"encoder.layer.11.attention.output.dense.weight\", \"encoder.layer.11.attention.output.dense.bias\", \"encoder.layer.11.attention.output.LayerNorm.weight\", \"encoder.layer.11.attention.output.LayerNorm.bias\", \"encoder.layer.11.intermediate.dense.weight\", \"encoder.layer.11.intermediate.dense.bias\", \"encoder.layer.11.output.dense.weight\", \"encoder.layer.11.output.dense.bias\", \"encoder.layer.11.output.LayerNorm.weight\", \"encoder.layer.11.output.LayerNorm.bias\", \"encoder.layer.12.attention.self.query.weight\", \"encoder.layer.12.attention.self.query.bias\", \"encoder.layer.12.attention.self.key.weight\", \"encoder.layer.12.attention.self.key.bias\", \"encoder.layer.12.attention.self.value.weight\", \"encoder.layer.12.attention.self.value.bias\", \"encoder.layer.12.attention.output.dense.weight\", \"encoder.layer.12.attention.output.dense.bias\", \"encoder.layer.12.attention.output.LayerNorm.weight\", \"encoder.layer.12.attention.output.LayerNorm.bias\", 
\"encoder.layer.12.intermediate.dense.weight\", \"encoder.layer.12.intermediate.dense.bias\", \"encoder.layer.12.output.dense.weight\", \"encoder.layer.12.output.dense.bias\", \"encoder.layer.12.output.LayerNorm.weight\", \"encoder.layer.12.output.LayerNorm.bias\", \"encoder.layer.13.attention.self.query.weight\", \"encoder.layer.13.attention.self.query.bias\", \"encoder.layer.13.attention.self.key.weight\", \"encoder.layer.13.attention.self.key.bias\", \"encoder.layer.13.attention.self.value.weight\", \"encoder.layer.13.attention.self.value.bias\", \"encoder.layer.13.attention.output.dense.weight\", \"encoder.layer.13.attention.output.dense.bias\", \"encoder.layer.13.attention.output.LayerNorm.weight\", \"encoder.layer.13.attention.output.LayerNorm.bias\", \"encoder.layer.13.intermediate.dense.weight\", \"encoder.layer.13.intermediate.dense.bias\", \"encoder.layer.13.output.dense.weight\", \"encoder.layer.13.output.dense.bias\", \"encoder.layer.13.output.LayerNorm.weight\", \"encoder.layer.13.output.LayerNorm.bias\", \"encoder.layer.14.attention.self.query.weight\", \"encoder.layer.14.attention.self.query.bias\", \"encoder.layer.14.attention.self.key.weight\", \"encoder.layer.14.attention.self.key.bias\", \"encoder.layer.14.attention.self.value.weight\", \"encoder.layer.14.attention.self.value.bias\", \"encoder.layer.14.attention.output.dense.weight\", \"encoder.layer.14.attention.output.dense.bias\", \"encoder.layer.14.attention.output.LayerNorm.weight\", \"encoder.layer.14.attention.output.LayerNorm.bias\", \"encoder.layer.14.intermediate.dense.weight\", \"encoder.layer.14.intermediate.dense.bias\", \"encoder.layer.14.output.dense.weight\", \"encoder.layer.14.output.dense.bias\", \"encoder.layer.14.output.LayerNorm.weight\", \"encoder.layer.14.output.LayerNorm.bias\", \"encoder.layer.15.attention.self.query.weight\", \"encoder.layer.15.attention.self.query.bias\", \"encoder.layer.15.attention.self.key.weight\", \"encoder.layer.15.attention.self.key.bias\", 
\"encoder.layer.15.attention.self.value.weight\", \"encoder.layer.15.attention.self.value.bias\", \"encoder.layer.15.attention.output.dense.weight\", \"encoder.layer.15.attention.output.dense.bias\", \"encoder.layer.15.attention.output.LayerNorm.weight\", \"encoder.layer.15.attention.output.LayerNorm.bias\", \"encoder.layer.15.intermediate.dense.weight\", \"encoder.layer.15.intermediate.dense.bias\", \"encoder.layer.15.output.dense.weight\", \"encoder.layer.15.output.dense.bias\", \"encoder.layer.15.output.LayerNorm.weight\", \"encoder.layer.15.output.LayerNorm.bias\", \"encoder.layer.16.attention.self.query.weight\", \"encoder.layer.16.attention.self.query.bias\", \"encoder.layer.16.attention.self.key.weight\", \"encoder.layer.16.attention.self.key.bias\", \"encoder.layer.16.attention.self.value.weight\", \"encoder.layer.16.attention.self.value.bias\", \"encoder.layer.16.attention.output.dense.weight\", \"encoder.layer.16.attention.output.dense.bias\", \"encoder.layer.16.attention.output.LayerNorm.weight\", \"encoder.layer.16.attention.output.LayerNorm.bias\", \"encoder.layer.16.intermediate.dense.weight\", \"encoder.layer.16.intermediate.dense.bias\", \"encoder.layer.16.output.dense.weight\", \"encoder.layer.16.output.dense.bias\", \"encoder.layer.16.output.LayerNorm.weight\", \"encoder.layer.16.output.LayerNorm.bias\", \"encoder.layer.17.attention.self.query.weight\", \"encoder.layer.17.attention.self.query.bias\", \"encoder.layer.17.attention.self.key.weight\", \"encoder.layer.17.attention.self.key.bias\", \"encoder.layer.17.attention.self.value.weight\", \"encoder.layer.17.attention.self.value.bias\", \"encoder.layer.17.attention.output.dense.weight\", \"encoder.layer.17.attention.output.dense.bias\", \"encoder.layer.17.attention.output.LayerNorm.weight\", \"encoder.layer.17.attention.output.LayerNorm.bias\", \"encoder.layer.17.intermediate.dense.weight\", \"encoder.layer.17.intermediate.dense.bias\", \"encoder.layer.17.output.dense.weight\", 
\"encoder.layer.17.output.dense.bias\", \"encoder.layer.17.output.LayerNorm.weight\", \"encoder.layer.17.output.LayerNorm.bias\", \"encoder.layer.18.attention.self.query.weight\", \"encoder.layer.18.attention.self.query.bias\", \"encoder.layer.18.attention.self.key.weight\", \"encoder.layer.18.attention.self.key.bias\", \"encoder.layer.18.attention.self.value.weight\", \"encoder.layer.18.attention.self.value.bias\", \"encoder.layer.18.attention.output.dense.weight\", \"encoder.layer.18.attention.output.dense.bias\", \"encoder.layer.18.attention.output.LayerNorm.weight\", \"encoder.layer.18.attention.output.LayerNorm.bias\", \"encoder.layer.18.intermediate.dense.weight\", \"encoder.layer.18.intermediate.dense.bias\", \"encoder.layer.18.output.dense.weight\", \"encoder.layer.18.output.dense.bias\", \"encoder.layer.18.output.LayerNorm.weight\", \"encoder.layer.18.output.LayerNorm.bias\", \"encoder.layer.19.attention.self.query.weight\", \"encoder.layer.19.attention.self.query.bias\", \"encoder.layer.19.attention.self.key.weight\", \"encoder.layer.19.attention.self.key.bias\", \"encoder.layer.19.attention.self.value.weight\", \"encoder.layer.19.attention.self.value.bias\", \"encoder.layer.19.attention.output.dense.weight\", \"encoder.layer.19.attention.output.dense.bias\", \"encoder.layer.19.attention.output.LayerNorm.weight\", \"encoder.layer.19.attention.output.LayerNorm.bias\", \"encoder.layer.19.intermediate.dense.weight\", \"encoder.layer.19.intermediate.dense.bias\", \"encoder.layer.19.output.dense.weight\", \"encoder.layer.19.output.dense.bias\", \"encoder.layer.19.output.LayerNorm.weight\", \"encoder.layer.19.output.LayerNorm.bias\", \"encoder.layer.20.attention.self.query.weight\", \"encoder.layer.20.attention.self.query.bias\", \"encoder.layer.20.attention.self.key.weight\", \"encoder.layer.20.attention.self.key.bias\", \"encoder.layer.20.attention.self.value.weight\", \"encoder.layer.20.attention.self.value.bias\", 
\"encoder.layer.20.attention.output.dense.weight\", \"encoder.layer.20.attention.output.dense.bias\", \"encoder.layer.20.attention.output.LayerNorm.weight\", \"encoder.layer.20.attention.output.LayerNorm.bias\", \"encoder.layer.20.intermediate.dense.weight\", \"encoder.layer.20.intermediate.dense.bias\", \"encoder.layer.20.output.dense.weight\", \"encoder.layer.20.output.dense.bias\", \"encoder.layer.20.output.LayerNorm.weight\", \"encoder.layer.20.output.LayerNorm.bias\", \"encoder.layer.21.attention.self.query.weight\", \"encoder.layer.21.attention.self.query.bias\", \"encoder.layer.21.attention.self.key.weight\", \"encoder.layer.21.attention.self.key.bias\", \"encoder.layer.21.attention.self.value.weight\", \"encoder.layer.21.attention.self.value.bias\", \"encoder.layer.21.attention.output.dense.weight\", \"encoder.layer.21.attention.output.dense.bias\", \"encoder.layer.21.attention.output.LayerNorm.weight\", \"encoder.layer.21.attention.output.LayerNorm.bias\", \"encoder.layer.21.intermediate.dense.weight\", \"encoder.layer.21.intermediate.dense.bias\", \"encoder.layer.21.output.dense.weight\", \"encoder.layer.21.output.dense.bias\", \"encoder.layer.21.output.LayerNorm.weight\", \"encoder.layer.21.output.LayerNorm.bias\", \"encoder.layer.22.attention.self.query.weight\", \"encoder.layer.22.attention.self.query.bias\", \"encoder.layer.22.attention.self.key.weight\", \"encoder.layer.22.attention.self.key.bias\", \"encoder.layer.22.attention.self.value.weight\", \"encoder.layer.22.attention.self.value.bias\", \"encoder.layer.22.attention.output.dense.weight\", \"encoder.layer.22.attention.output.dense.bias\", \"encoder.layer.22.attention.output.LayerNorm.weight\", \"encoder.layer.22.attention.output.LayerNorm.bias\", \"encoder.layer.22.intermediate.dense.weight\", \"encoder.layer.22.intermediate.dense.bias\", \"encoder.layer.22.output.dense.weight\", \"encoder.layer.22.output.dense.bias\", \"encoder.layer.22.output.LayerNorm.weight\", 
\"encoder.layer.22.output.LayerNorm.bias\", \"encoder.layer.23.attention.self.query.weight\", \"encoder.layer.23.attention.self.query.bias\", \"encoder.layer.23.attention.self.key.weight\", \"encoder.layer.23.attention.self.key.bias\", \"encoder.layer.23.attention.self.value.weight\", \"encoder.layer.23.attention.self.value.bias\", \"encoder.layer.23.attention.output.dense.weight\", \"encoder.layer.23.attention.output.dense.bias\", \"encoder.layer.23.attention.output.LayerNorm.weight\", \"encoder.layer.23.attention.output.LayerNorm.bias\", \"encoder.layer.23.intermediate.dense.weight\", \"encoder.layer.23.intermediate.dense.bias\", \"encoder.layer.23.output.dense.weight\", \"encoder.layer.23.output.dense.bias\", \"encoder.layer.23.output.LayerNorm.weight\", \"encoder.layer.23.output.LayerNorm.bias\", \"pooler.dense.weight\", \"pooler.dense.bias\". \n\tUnexpected key(s) in state_dict: \"bert.embeddings.word_embeddings.weight\", \"bert.embeddings.position_embeddings.weight\", \"bert.embeddings.token_type_embeddings.weight\", \"bert.embeddings.LayerNorm.weight\", \"bert.embeddings.LayerNorm.bias\", \"bert.encoder.layer.0.attention.self.query.weight\", \"bert.encoder.layer.0.attention.self.query.bias\", \"bert.encoder.layer.0.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.0.attention.self.key.weight\", \"bert.encoder.layer.0.attention.self.key.bias\", \"bert.encoder.layer.0.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.0.attention.self.value.weight\", \"bert.encoder.layer.0.attention.self.value.bias\", \"bert.encoder.layer.0.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.0.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.0.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.0.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.0.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.0.attention.output.dense.weight\", 
\"bert.encoder.layer.0.attention.output.dense.bias\", \"bert.encoder.layer.0.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.0.attention.output.LayerNorm.weight\", \"bert.encoder.layer.0.attention.output.LayerNorm.bias\", \"bert.encoder.layer.0.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.0.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.0.intermediate.dense.weight\", \"bert.encoder.layer.0.intermediate.dense.bias\", \"bert.encoder.layer.0.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.0.output.dense.weight\", \"bert.encoder.layer.0.output.dense.bias\", \"bert.encoder.layer.0.output.dense._input_quantizer._amax\", \"bert.encoder.layer.0.output.LayerNorm.weight\", \"bert.encoder.layer.0.output.LayerNorm.bias\", \"bert.encoder.layer.0.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.0.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.1.attention.self.query.weight\", \"bert.encoder.layer.1.attention.self.query.bias\", \"bert.encoder.layer.1.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.1.attention.self.key.weight\", \"bert.encoder.layer.1.attention.self.key.bias\", \"bert.encoder.layer.1.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.1.attention.self.value.weight\", \"bert.encoder.layer.1.attention.self.value.bias\", \"bert.encoder.layer.1.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.1.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.1.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.1.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.1.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.1.attention.output.dense.weight\", \"bert.encoder.layer.1.attention.output.dense.bias\", \"bert.encoder.layer.1.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.1.attention.output.LayerNorm.weight\", 
\"bert.encoder.layer.1.attention.output.LayerNorm.bias\", \"bert.encoder.layer.1.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.1.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.1.intermediate.dense.weight\", \"bert.encoder.layer.1.intermediate.dense.bias\", \"bert.encoder.layer.1.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.1.output.dense.weight\", \"bert.encoder.layer.1.output.dense.bias\", \"bert.encoder.layer.1.output.dense._input_quantizer._amax\", \"bert.encoder.layer.1.output.LayerNorm.weight\", \"bert.encoder.layer.1.output.LayerNorm.bias\", \"bert.encoder.layer.1.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.1.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.2.attention.self.query.weight\", \"bert.encoder.layer.2.attention.self.query.bias\", \"bert.encoder.layer.2.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.2.attention.self.key.weight\", \"bert.encoder.layer.2.attention.self.key.bias\", \"bert.encoder.layer.2.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.2.attention.self.value.weight\", \"bert.encoder.layer.2.attention.self.value.bias\", \"bert.encoder.layer.2.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.2.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.2.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.2.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.2.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.2.attention.output.dense.weight\", \"bert.encoder.layer.2.attention.output.dense.bias\", \"bert.encoder.layer.2.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.2.attention.output.LayerNorm.weight\", \"bert.encoder.layer.2.attention.output.LayerNorm.bias\", \"bert.encoder.layer.2.attention.output.add_local_input_quantizer._amax\", 
\"bert.encoder.layer.2.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.2.intermediate.dense.weight\", \"bert.encoder.layer.2.intermediate.dense.bias\", \"bert.encoder.layer.2.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.2.output.dense.weight\", \"bert.encoder.layer.2.output.dense.bias\", \"bert.encoder.layer.2.output.dense._input_quantizer._amax\", \"bert.encoder.layer.2.output.LayerNorm.weight\", \"bert.encoder.layer.2.output.LayerNorm.bias\", \"bert.encoder.layer.2.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.2.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.3.attention.self.query.weight\", \"bert.encoder.layer.3.attention.self.query.bias\", \"bert.encoder.layer.3.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.3.attention.self.key.weight\", \"bert.encoder.layer.3.attention.self.key.bias\", \"bert.encoder.layer.3.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.3.attention.self.value.weight\", \"bert.encoder.layer.3.attention.self.value.bias\", \"bert.encoder.layer.3.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.3.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.3.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.3.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.3.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.3.attention.output.dense.weight\", \"bert.encoder.layer.3.attention.output.dense.bias\", \"bert.encoder.layer.3.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.3.attention.output.LayerNorm.weight\", \"bert.encoder.layer.3.attention.output.LayerNorm.bias\", \"bert.encoder.layer.3.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.3.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.3.intermediate.dense.weight\", \"bert.encoder.layer.3.intermediate.dense.bias\", 
\"bert.encoder.layer.3.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.3.output.dense.weight\", \"bert.encoder.layer.3.output.dense.bias\", \"bert.encoder.layer.3.output.dense._input_quantizer._amax\", \"bert.encoder.layer.3.output.LayerNorm.weight\", \"bert.encoder.layer.3.output.LayerNorm.bias\", \"bert.encoder.layer.3.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.3.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.4.attention.self.query.weight\", \"bert.encoder.layer.4.attention.self.query.bias\", \"bert.encoder.layer.4.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.4.attention.self.key.weight\", \"bert.encoder.layer.4.attention.self.key.bias\", \"bert.encoder.layer.4.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.4.attention.self.value.weight\", \"bert.encoder.layer.4.attention.self.value.bias\", \"bert.encoder.layer.4.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.4.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.4.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.4.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.4.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.4.attention.output.dense.weight\", \"bert.encoder.layer.4.attention.output.dense.bias\", \"bert.encoder.layer.4.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.4.attention.output.LayerNorm.weight\", \"bert.encoder.layer.4.attention.output.LayerNorm.bias\", \"bert.encoder.layer.4.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.4.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.4.intermediate.dense.weight\", \"bert.encoder.layer.4.intermediate.dense.bias\", \"bert.encoder.layer.4.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.4.output.dense.weight\", \"bert.encoder.layer.4.output.dense.bias\", 
\"bert.encoder.layer.4.output.dense._input_quantizer._amax\", \"bert.encoder.layer.4.output.LayerNorm.weight\", \"bert.encoder.layer.4.output.LayerNorm.bias\", \"bert.encoder.layer.4.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.4.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.5.attention.self.query.weight\", \"bert.encoder.layer.5.attention.self.query.bias\", \"bert.encoder.layer.5.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.5.attention.self.key.weight\", \"bert.encoder.layer.5.attention.self.key.bias\", \"bert.encoder.layer.5.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.5.attention.self.value.weight\", \"bert.encoder.layer.5.attention.self.value.bias\", \"bert.encoder.layer.5.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.5.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.5.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.5.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.5.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.5.attention.output.dense.weight\", \"bert.encoder.layer.5.attention.output.dense.bias\", \"bert.encoder.layer.5.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.5.attention.output.LayerNorm.weight\", \"bert.encoder.layer.5.attention.output.LayerNorm.bias\", \"bert.encoder.layer.5.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.5.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.5.intermediate.dense.weight\", \"bert.encoder.layer.5.intermediate.dense.bias\", \"bert.encoder.layer.5.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.5.output.dense.weight\", \"bert.encoder.layer.5.output.dense.bias\", \"bert.encoder.layer.5.output.dense._input_quantizer._amax\", \"bert.encoder.layer.5.output.LayerNorm.weight\", \"bert.encoder.layer.5.output.LayerNorm.bias\", 
\"bert.encoder.layer.5.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.5.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.6.attention.self.query.weight\", \"bert.encoder.layer.6.attention.self.query.bias\", \"bert.encoder.layer.6.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.6.attention.self.key.weight\", \"bert.encoder.layer.6.attention.self.key.bias\", \"bert.encoder.layer.6.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.6.attention.self.value.weight\", \"bert.encoder.layer.6.attention.self.value.bias\", \"bert.encoder.layer.6.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.6.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.6.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.6.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.6.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.6.attention.output.dense.weight\", \"bert.encoder.layer.6.attention.output.dense.bias\", \"bert.encoder.layer.6.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.6.attention.output.LayerNorm.weight\", \"bert.encoder.layer.6.attention.output.LayerNorm.bias\", \"bert.encoder.layer.6.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.6.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.6.intermediate.dense.weight\", \"bert.encoder.layer.6.intermediate.dense.bias\", \"bert.encoder.layer.6.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.6.output.dense.weight\", \"bert.encoder.layer.6.output.dense.bias\", \"bert.encoder.layer.6.output.dense._input_quantizer._amax\", \"bert.encoder.layer.6.output.LayerNorm.weight\", \"bert.encoder.layer.6.output.LayerNorm.bias\", \"bert.encoder.layer.6.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.6.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.7.attention.self.query.weight\", 
\"bert.encoder.layer.7.attention.self.query.bias\", \"bert.encoder.layer.7.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.7.attention.self.key.weight\", \"bert.encoder.layer.7.attention.self.key.bias\", \"bert.encoder.layer.7.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.7.attention.self.value.weight\", \"bert.encoder.layer.7.attention.self.value.bias\", \"bert.encoder.layer.7.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.7.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.7.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.7.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.7.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.7.attention.output.dense.weight\", \"bert.encoder.layer.7.attention.output.dense.bias\", \"bert.encoder.layer.7.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.7.attention.output.LayerNorm.weight\", \"bert.encoder.layer.7.attention.output.LayerNorm.bias\", \"bert.encoder.layer.7.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.7.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.7.intermediate.dense.weight\", \"bert.encoder.layer.7.intermediate.dense.bias\", \"bert.encoder.layer.7.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.7.output.dense.weight\", \"bert.encoder.layer.7.output.dense.bias\", \"bert.encoder.layer.7.output.dense._input_quantizer._amax\", \"bert.encoder.layer.7.output.LayerNorm.weight\", \"bert.encoder.layer.7.output.LayerNorm.bias\", \"bert.encoder.layer.7.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.7.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.8.attention.self.query.weight\", \"bert.encoder.layer.8.attention.self.query.bias\", \"bert.encoder.layer.8.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.8.attention.self.key.weight\", 
\"bert.encoder.layer.8.attention.self.key.bias\", \"bert.encoder.layer.8.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.8.attention.self.value.weight\", \"bert.encoder.layer.8.attention.self.value.bias\", \"bert.encoder.layer.8.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.8.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.8.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.8.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.8.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.8.attention.output.dense.weight\", \"bert.encoder.layer.8.attention.output.dense.bias\", \"bert.encoder.layer.8.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.8.attention.output.LayerNorm.weight\", \"bert.encoder.layer.8.attention.output.LayerNorm.bias\", \"bert.encoder.layer.8.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.8.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.8.intermediate.dense.weight\", \"bert.encoder.layer.8.intermediate.dense.bias\", \"bert.encoder.layer.8.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.8.output.dense.weight\", \"bert.encoder.layer.8.output.dense.bias\", \"bert.encoder.layer.8.output.dense._input_quantizer._amax\", \"bert.encoder.layer.8.output.LayerNorm.weight\", \"bert.encoder.layer.8.output.LayerNorm.bias\", \"bert.encoder.layer.8.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.8.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.9.attention.self.query.weight\", \"bert.encoder.layer.9.attention.self.query.bias\", \"bert.encoder.layer.9.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.9.attention.self.key.weight\", \"bert.encoder.layer.9.attention.self.key.bias\", \"bert.encoder.layer.9.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.9.attention.self.value.weight\", 
\"bert.encoder.layer.9.attention.self.value.bias\", \"bert.encoder.layer.9.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.9.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.9.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.9.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.9.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.9.attention.output.dense.weight\", \"bert.encoder.layer.9.attention.output.dense.bias\", \"bert.encoder.layer.9.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.9.attention.output.LayerNorm.weight\", \"bert.encoder.layer.9.attention.output.LayerNorm.bias\", \"bert.encoder.layer.9.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.9.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.9.intermediate.dense.weight\", \"bert.encoder.layer.9.intermediate.dense.bias\", \"bert.encoder.layer.9.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.9.output.dense.weight\", \"bert.encoder.layer.9.output.dense.bias\", \"bert.encoder.layer.9.output.dense._input_quantizer._amax\", \"bert.encoder.layer.9.output.LayerNorm.weight\", \"bert.encoder.layer.9.output.LayerNorm.bias\", \"bert.encoder.layer.9.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.9.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.10.attention.self.query.weight\", \"bert.encoder.layer.10.attention.self.query.bias\", \"bert.encoder.layer.10.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.10.attention.self.key.weight\", \"bert.encoder.layer.10.attention.self.key.bias\", \"bert.encoder.layer.10.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.10.attention.self.value.weight\", \"bert.encoder.layer.10.attention.self.value.bias\", \"bert.encoder.layer.10.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.10.attention.self.qv_a_input_quantizer._amax\", 
\"bert.encoder.layer.10.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.10.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.10.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.10.attention.output.dense.weight\", \"bert.encoder.layer.10.attention.output.dense.bias\", \"bert.encoder.layer.10.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.10.attention.output.LayerNorm.weight\", \"bert.encoder.layer.10.attention.output.LayerNorm.bias\", \"bert.encoder.layer.10.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.10.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.10.intermediate.dense.weight\", \"bert.encoder.layer.10.intermediate.dense.bias\", \"bert.encoder.layer.10.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.10.output.dense.weight\", \"bert.encoder.layer.10.output.dense.bias\", \"bert.encoder.layer.10.output.dense._input_quantizer._amax\", \"bert.encoder.layer.10.output.LayerNorm.weight\", \"bert.encoder.layer.10.output.LayerNorm.bias\", \"bert.encoder.layer.10.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.10.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.11.attention.self.query.weight\", \"bert.encoder.layer.11.attention.self.query.bias\", \"bert.encoder.layer.11.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.11.attention.self.key.weight\", \"bert.encoder.layer.11.attention.self.key.bias\", \"bert.encoder.layer.11.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.11.attention.self.value.weight\", \"bert.encoder.layer.11.attention.self.value.bias\", \"bert.encoder.layer.11.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.11.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.11.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.11.attention.self.av_a_input_quantizer._amax\", 
\"bert.encoder.layer.11.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.11.attention.output.dense.weight\", \"bert.encoder.layer.11.attention.output.dense.bias\", \"bert.encoder.layer.11.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.11.attention.output.LayerNorm.weight\", \"bert.encoder.layer.11.attention.output.LayerNorm.bias\", \"bert.encoder.layer.11.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.11.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.11.intermediate.dense.weight\", \"bert.encoder.layer.11.intermediate.dense.bias\", \"bert.encoder.layer.11.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.11.output.dense.weight\", \"bert.encoder.layer.11.output.dense.bias\", \"bert.encoder.layer.11.output.dense._input_quantizer._amax\", \"bert.encoder.layer.11.output.LayerNorm.weight\", \"bert.encoder.layer.11.output.LayerNorm.bias\", \"bert.encoder.layer.11.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.11.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.12.attention.self.query.weight\", \"bert.encoder.layer.12.attention.self.query.bias\", \"bert.encoder.layer.12.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.12.attention.self.key.weight\", \"bert.encoder.layer.12.attention.self.key.bias\", \"bert.encoder.layer.12.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.12.attention.self.value.weight\", \"bert.encoder.layer.12.attention.self.value.bias\", \"bert.encoder.layer.12.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.12.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.12.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.12.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.12.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.12.attention.output.dense.weight\", 
\"bert.encoder.layer.12.attention.output.dense.bias\", \"bert.encoder.layer.12.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.12.attention.output.LayerNorm.weight\", \"bert.encoder.layer.12.attention.output.LayerNorm.bias\", \"bert.encoder.layer.12.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.12.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.12.intermediate.dense.weight\", \"bert.encoder.layer.12.intermediate.dense.bias\", \"bert.encoder.layer.12.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.12.output.dense.weight\", \"bert.encoder.layer.12.output.dense.bias\", \"bert.encoder.layer.12.output.dense._input_quantizer._amax\", \"bert.encoder.layer.12.output.LayerNorm.weight\", \"bert.encoder.layer.12.output.LayerNorm.bias\", \"bert.encoder.layer.12.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.12.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.13.attention.self.query.weight\", \"bert.encoder.layer.13.attention.self.query.bias\", \"bert.encoder.layer.13.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.13.attention.self.key.weight\", \"bert.encoder.layer.13.attention.self.key.bias\", \"bert.encoder.layer.13.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.13.attention.self.value.weight\", \"bert.encoder.layer.13.attention.self.value.bias\", \"bert.encoder.layer.13.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.13.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.13.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.13.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.13.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.13.attention.output.dense.weight\", \"bert.encoder.layer.13.attention.output.dense.bias\", \"bert.encoder.layer.13.attention.output.dense._input_quantizer._amax\", 
\"bert.encoder.layer.13.attention.output.LayerNorm.weight\", \"bert.encoder.layer.13.attention.output.LayerNorm.bias\", \"bert.encoder.layer.13.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.13.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.13.intermediate.dense.weight\", \"bert.encoder.layer.13.intermediate.dense.bias\", \"bert.encoder.layer.13.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.13.output.dense.weight\", \"bert.encoder.layer.13.output.dense.bias\", \"bert.encoder.layer.13.output.dense._input_quantizer._amax\", \"bert.encoder.layer.13.output.LayerNorm.weight\", \"bert.encoder.layer.13.output.LayerNorm.bias\", \"bert.encoder.layer.13.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.13.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.14.attention.self.query.weight\", \"bert.encoder.layer.14.attention.self.query.bias\", \"bert.encoder.layer.14.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.14.attention.self.key.weight\", \"bert.encoder.layer.14.attention.self.key.bias\", \"bert.encoder.layer.14.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.14.attention.self.value.weight\", \"bert.encoder.layer.14.attention.self.value.bias\", \"bert.encoder.layer.14.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.14.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.14.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.14.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.14.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.14.attention.output.dense.weight\", \"bert.encoder.layer.14.attention.output.dense.bias\", \"bert.encoder.layer.14.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.14.attention.output.LayerNorm.weight\", \"bert.encoder.layer.14.attention.output.LayerNorm.bias\", 
\"bert.encoder.layer.14.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.14.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.14.intermediate.dense.weight\", \"bert.encoder.layer.14.intermediate.dense.bias\", \"bert.encoder.layer.14.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.14.output.dense.weight\", \"bert.encoder.layer.14.output.dense.bias\", \"bert.encoder.layer.14.output.dense._input_quantizer._amax\", \"bert.encoder.layer.14.output.LayerNorm.weight\", \"bert.encoder.layer.14.output.LayerNorm.bias\", \"bert.encoder.layer.14.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.14.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.15.attention.self.query.weight\", \"bert.encoder.layer.15.attention.self.query.bias\", \"bert.encoder.layer.15.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.15.attention.self.key.weight\", \"bert.encoder.layer.15.attention.self.key.bias\", \"bert.encoder.layer.15.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.15.attention.self.value.weight\", \"bert.encoder.layer.15.attention.self.value.bias\", \"bert.encoder.layer.15.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.15.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.15.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.15.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.15.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.15.attention.output.dense.weight\", \"bert.encoder.layer.15.attention.output.dense.bias\", \"bert.encoder.layer.15.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.15.attention.output.LayerNorm.weight\", \"bert.encoder.layer.15.attention.output.LayerNorm.bias\", \"bert.encoder.layer.15.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.15.attention.output.add_residual_input_quantizer._amax\", 
\"bert.encoder.layer.15.intermediate.dense.weight\", \"bert.encoder.layer.15.intermediate.dense.bias\", \"bert.encoder.layer.15.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.15.output.dense.weight\", \"bert.encoder.layer.15.output.dense.bias\", \"bert.encoder.layer.15.output.dense._input_quantizer._amax\", \"bert.encoder.layer.15.output.LayerNorm.weight\", \"bert.encoder.layer.15.output.LayerNorm.bias\", \"bert.encoder.layer.15.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.15.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.16.attention.self.query.weight\", \"bert.encoder.layer.16.attention.self.query.bias\", \"bert.encoder.layer.16.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.16.attention.self.key.weight\", \"bert.encoder.layer.16.attention.self.key.bias\", \"bert.encoder.layer.16.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.16.attention.self.value.weight\", \"bert.encoder.layer.16.attention.self.value.bias\", \"bert.encoder.layer.16.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.16.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.16.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.16.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.16.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.16.attention.output.dense.weight\", \"bert.encoder.layer.16.attention.output.dense.bias\", \"bert.encoder.layer.16.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.16.attention.output.LayerNorm.weight\", \"bert.encoder.layer.16.attention.output.LayerNorm.bias\", \"bert.encoder.layer.16.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.16.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.16.intermediate.dense.weight\", \"bert.encoder.layer.16.intermediate.dense.bias\", 
\"bert.encoder.layer.16.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.16.output.dense.weight\", \"bert.encoder.layer.16.output.dense.bias\", \"bert.encoder.layer.16.output.dense._input_quantizer._amax\", \"bert.encoder.layer.16.output.LayerNorm.weight\", \"bert.encoder.layer.16.output.LayerNorm.bias\", \"bert.encoder.layer.16.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.16.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.17.attention.self.query.weight\", \"bert.encoder.layer.17.attention.self.query.bias\", \"bert.encoder.layer.17.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.17.attention.self.key.weight\", \"bert.encoder.layer.17.attention.self.key.bias\", \"bert.encoder.layer.17.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.17.attention.self.value.weight\", \"bert.encoder.layer.17.attention.self.value.bias\", \"bert.encoder.layer.17.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.17.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.17.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.17.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.17.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.17.attention.output.dense.weight\", \"bert.encoder.layer.17.attention.output.dense.bias\", \"bert.encoder.layer.17.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.17.attention.output.LayerNorm.weight\", \"bert.encoder.layer.17.attention.output.LayerNorm.bias\", \"bert.encoder.layer.17.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.17.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.17.intermediate.dense.weight\", \"bert.encoder.layer.17.intermediate.dense.bias\", \"bert.encoder.layer.17.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.17.output.dense.weight\", \"bert.encoder.layer.17.output.dense.bias\", 
\"bert.encoder.layer.17.output.dense._input_quantizer._amax\", \"bert.encoder.layer.17.output.LayerNorm.weight\", \"bert.encoder.layer.17.output.LayerNorm.bias\", \"bert.encoder.layer.17.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.17.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.18.attention.self.query.weight\", \"bert.encoder.layer.18.attention.self.query.bias\", \"bert.encoder.layer.18.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.18.attention.self.key.weight\", \"bert.encoder.layer.18.attention.self.key.bias\", \"bert.encoder.layer.18.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.18.attention.self.value.weight\", \"bert.encoder.layer.18.attention.self.value.bias\", \"bert.encoder.layer.18.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.18.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.18.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.18.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.18.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.18.attention.output.dense.weight\", \"bert.encoder.layer.18.attention.output.dense.bias\", \"bert.encoder.layer.18.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.18.attention.output.LayerNorm.weight\", \"bert.encoder.layer.18.attention.output.LayerNorm.bias\", \"bert.encoder.layer.18.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.18.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.18.intermediate.dense.weight\", \"bert.encoder.layer.18.intermediate.dense.bias\", \"bert.encoder.layer.18.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.18.output.dense.weight\", \"bert.encoder.layer.18.output.dense.bias\", \"bert.encoder.layer.18.output.dense._input_quantizer._amax\", \"bert.encoder.layer.18.output.LayerNorm.weight\", \"bert.encoder.layer.18.output.LayerNorm.bias\", 
\"bert.encoder.layer.18.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.18.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.19.attention.self.query.weight\", \"bert.encoder.layer.19.attention.self.query.bias\", \"bert.encoder.layer.19.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.19.attention.self.key.weight\", \"bert.encoder.layer.19.attention.self.key.bias\", \"bert.encoder.layer.19.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.19.attention.self.value.weight\", \"bert.encoder.layer.19.attention.self.value.bias\", \"bert.encoder.layer.19.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.19.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.19.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.19.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.19.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.19.attention.output.dense.weight\", \"bert.encoder.layer.19.attention.output.dense.bias\", \"bert.encoder.layer.19.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.19.attention.output.LayerNorm.weight\", \"bert.encoder.layer.19.attention.output.LayerNorm.bias\", \"bert.encoder.layer.19.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.19.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.19.intermediate.dense.weight\", \"bert.encoder.layer.19.intermediate.dense.bias\", \"bert.encoder.layer.19.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.19.output.dense.weight\", \"bert.encoder.layer.19.output.dense.bias\", \"bert.encoder.layer.19.output.dense._input_quantizer._amax\", \"bert.encoder.layer.19.output.LayerNorm.weight\", \"bert.encoder.layer.19.output.LayerNorm.bias\", \"bert.encoder.layer.19.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.19.output.add_residual_input_quantizer._amax\", 
\"bert.encoder.layer.20.attention.self.query.weight\", \"bert.encoder.layer.20.attention.self.query.bias\", \"bert.encoder.layer.20.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.20.attention.self.key.weight\", \"bert.encoder.layer.20.attention.self.key.bias\", \"bert.encoder.layer.20.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.20.attention.self.value.weight\", \"bert.encoder.layer.20.attention.self.value.bias\", \"bert.encoder.layer.20.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.20.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.20.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.20.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.20.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.20.attention.output.dense.weight\", \"bert.encoder.layer.20.attention.output.dense.bias\", \"bert.encoder.layer.20.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.20.attention.output.LayerNorm.weight\", \"bert.encoder.layer.20.attention.output.LayerNorm.bias\", \"bert.encoder.layer.20.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.20.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.20.intermediate.dense.weight\", \"bert.encoder.layer.20.intermediate.dense.bias\", \"bert.encoder.layer.20.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.20.output.dense.weight\", \"bert.encoder.layer.20.output.dense.bias\", \"bert.encoder.layer.20.output.dense._input_quantizer._amax\", \"bert.encoder.layer.20.output.LayerNorm.weight\", \"bert.encoder.layer.20.output.LayerNorm.bias\", \"bert.encoder.layer.20.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.20.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.21.attention.self.query.weight\", \"bert.encoder.layer.21.attention.self.query.bias\", 
\"bert.encoder.layer.21.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.21.attention.self.key.weight\", \"bert.encoder.layer.21.attention.self.key.bias\", \"bert.encoder.layer.21.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.21.attention.self.value.weight\", \"bert.encoder.layer.21.attention.self.value.bias\", \"bert.encoder.layer.21.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.21.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.21.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.21.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.21.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.21.attention.output.dense.weight\", \"bert.encoder.layer.21.attention.output.dense.bias\", \"bert.encoder.layer.21.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.21.attention.output.LayerNorm.weight\", \"bert.encoder.layer.21.attention.output.LayerNorm.bias\", \"bert.encoder.layer.21.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.21.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.21.intermediate.dense.weight\", \"bert.encoder.layer.21.intermediate.dense.bias\", \"bert.encoder.layer.21.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.21.output.dense.weight\", \"bert.encoder.layer.21.output.dense.bias\", \"bert.encoder.layer.21.output.dense._input_quantizer._amax\", \"bert.encoder.layer.21.output.LayerNorm.weight\", \"bert.encoder.layer.21.output.LayerNorm.bias\", \"bert.encoder.layer.21.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.21.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.22.attention.self.query.weight\", \"bert.encoder.layer.22.attention.self.query.bias\", \"bert.encoder.layer.22.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.22.attention.self.key.weight\", 
\"bert.encoder.layer.22.attention.self.key.bias\", \"bert.encoder.layer.22.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.22.attention.self.value.weight\", \"bert.encoder.layer.22.attention.self.value.bias\", \"bert.encoder.layer.22.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.22.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.22.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.22.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.22.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.22.attention.output.dense.weight\", \"bert.encoder.layer.22.attention.output.dense.bias\", \"bert.encoder.layer.22.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.22.attention.output.LayerNorm.weight\", \"bert.encoder.layer.22.attention.output.LayerNorm.bias\", \"bert.encoder.layer.22.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.22.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.22.intermediate.dense.weight\", \"bert.encoder.layer.22.intermediate.dense.bias\", \"bert.encoder.layer.22.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.22.output.dense.weight\", \"bert.encoder.layer.22.output.dense.bias\", \"bert.encoder.layer.22.output.dense._input_quantizer._amax\", \"bert.encoder.layer.22.output.LayerNorm.weight\", \"bert.encoder.layer.22.output.LayerNorm.bias\", \"bert.encoder.layer.22.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.22.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.23.attention.self.query.weight\", \"bert.encoder.layer.23.attention.self.query.bias\", \"bert.encoder.layer.23.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.23.attention.self.key.weight\", \"bert.encoder.layer.23.attention.self.key.bias\", \"bert.encoder.layer.23.attention.self.key._input_quantizer._amax\", 
\"bert.encoder.layer.23.attention.self.value.weight\", \"bert.encoder.layer.23.attention.self.value.bias\", \"bert.encoder.layer.23.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.23.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.23.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.23.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.23.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.23.attention.output.dense.weight\", \"bert.encoder.layer.23.attention.output.dense.bias\", \"bert.encoder.layer.23.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.23.attention.output.LayerNorm.weight\", \"bert.encoder.layer.23.attention.output.LayerNorm.bias\", \"bert.encoder.layer.23.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.23.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.23.intermediate.dense.weight\", \"bert.encoder.layer.23.intermediate.dense.bias\", \"bert.encoder.layer.23.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.23.output.dense.weight\", \"bert.encoder.layer.23.output.dense.bias\", \"bert.encoder.layer.23.output.dense._input_quantizer._amax\", \"bert.encoder.layer.23.output.LayerNorm.weight\", \"bert.encoder.layer.23.output.LayerNorm.bias\", \"bert.encoder.layer.23.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.23.output.add_residual_input_quantizer._amax\", \"bert.encoder.final_input_quantizer._amax\", \"bert.pooler.dense.weight\", \"bert.pooler.dense.bias\", \"qa_outputs.weight\", \"qa_outputs.bias\". ",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
      "Input \u001b[0;32mIn [10]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_state_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m/Users/oliver/Downloads/pytorch_model.bin\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_location\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mcpu\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniforge3/envs/trans-fat/lib/python3.9/site-packages/torch/nn/modules/module.py:1482\u001b[0m, in \u001b[0;36mModule.load_state_dict\u001b[0;34m(self, state_dict, strict)\u001b[0m\n\u001b[1;32m   1477\u001b[0m         error_msgs\u001b[38;5;241m.\u001b[39minsert(\n\u001b[1;32m   1478\u001b[0m             \u001b[38;5;241m0\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMissing key(s) in state_dict: \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m. \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[1;32m   1479\u001b[0m                 \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mformat(k) \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m missing_keys)))\n\u001b[1;32m   1481\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(error_msgs) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m-> 1482\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mError(s) in loading state_dict for \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[1;32m   1483\u001b[0m                        \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(error_msgs)))\n\u001b[1;32m   1484\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m _IncompatibleKeys(missing_keys, unexpected_keys)\n",
      "\u001b[0;31mRuntimeError\u001b[0m: Error(s) in loading state_dict for BertModel:\n\tMissing key(s) in state_dict: \"embeddings.position_ids\", \"embeddings.word_embeddings.weight\", \"embeddings.position_embeddings.weight\", \"embeddings.token_type_embeddings.weight\", \"embeddings.LayerNorm.weight\", \"embeddings.LayerNorm.bias\", \"encoder.layer.0.attention.self.query.weight\", \"encoder.layer.0.attention.self.query.bias\", \"encoder.layer.0.attention.self.key.weight\", \"encoder.layer.0.attention.self.key.bias\", \"encoder.layer.0.attention.self.value.weight\", \"encoder.layer.0.attention.self.value.bias\", \"encoder.layer.0.attention.output.dense.weight\", \"encoder.layer.0.attention.output.dense.bias\", \"encoder.layer.0.attention.output.LayerNorm.weight\", \"encoder.layer.0.attention.output.LayerNorm.bias\", \"encoder.layer.0.intermediate.dense.weight\", \"encoder.layer.0.intermediate.dense.bias\", \"encoder.layer.0.output.dense.weight\", \"encoder.layer.0.output.dense.bias\", \"encoder.layer.0.output.LayerNorm.weight\", \"encoder.layer.0.output.LayerNorm.bias\", \"encoder.layer.1.attention.self.query.weight\", \"encoder.layer.1.attention.self.query.bias\", \"encoder.layer.1.attention.self.key.weight\", \"encoder.layer.1.attention.self.key.bias\", \"encoder.layer.1.attention.self.value.weight\", \"encoder.layer.1.attention.self.value.bias\", \"encoder.layer.1.attention.output.dense.weight\", \"encoder.layer.1.attention.output.dense.bias\", \"encoder.layer.1.attention.output.LayerNorm.weight\", \"encoder.layer.1.attention.output.LayerNorm.bias\", \"encoder.layer.1.intermediate.dense.weight\", \"encoder.layer.1.intermediate.dense.bias\", \"encoder.layer.1.output.dense.weight\", \"encoder.layer.1.output.dense.bias\", \"encoder.layer.1.output.LayerNorm.weight\", \"encoder.layer.1.output.LayerNorm.bias\", \"encoder.layer.2.attention.self.query.weight\", \"encoder.layer.2.attention.self.query.bias\", \"encoder.layer.2.attention.self.key.weight\", 
\"encoder.layer.2.attention.self.key.bias\", \"encoder.layer.2.attention.self.value.weight\", \"encoder.layer.2.attention.self.value.bias\", \"encoder.layer.2.attention.output.dense.weight\", \"encoder.layer.2.attention.output.dense.bias\", \"encoder.layer.2.attention.output.LayerNorm.weight\", \"encoder.layer.2.attention.output.LayerNorm.bias\", \"encoder.layer.2.intermediate.dense.weight\", \"encoder.layer.2.intermediate.dense.bias\", \"encoder.layer.2.output.dense.weight\", \"encoder.layer.2.output.dense.bias\", \"encoder.layer.2.output.LayerNorm.weight\", \"encoder.layer.2.output.LayerNorm.bias\", \"encoder.layer.3.attention.self.query.weight\", \"encoder.layer.3.attention.self.query.bias\", \"encoder.layer.3.attention.self.key.weight\", \"encoder.layer.3.attention.self.key.bias\", \"encoder.layer.3.attention.self.value.weight\", \"encoder.layer.3.attention.self.value.bias\", \"encoder.layer.3.attention.output.dense.weight\", \"encoder.layer.3.attention.output.dense.bias\", \"encoder.layer.3.attention.output.LayerNorm.weight\", \"encoder.layer.3.attention.output.LayerNorm.bias\", \"encoder.layer.3.intermediate.dense.weight\", \"encoder.layer.3.intermediate.dense.bias\", \"encoder.layer.3.output.dense.weight\", \"encoder.layer.3.output.dense.bias\", \"encoder.layer.3.output.LayerNorm.weight\", \"encoder.layer.3.output.LayerNorm.bias\", \"encoder.layer.4.attention.self.query.weight\", \"encoder.layer.4.attention.self.query.bias\", \"encoder.layer.4.attention.self.key.weight\", \"encoder.layer.4.attention.self.key.bias\", \"encoder.layer.4.attention.self.value.weight\", \"encoder.layer.4.attention.self.value.bias\", \"encoder.layer.4.attention.output.dense.weight\", \"encoder.layer.4.attention.output.dense.bias\", \"encoder.layer.4.attention.output.LayerNorm.weight\", \"encoder.layer.4.attention.output.LayerNorm.bias\", \"encoder.layer.4.intermediate.dense.weight\", \"encoder.layer.4.intermediate.dense.bias\", \"encoder.layer.4.output.dense.weight\", 
\"encoder.layer.4.output.dense.bias\", \"encoder.layer.4.output.LayerNorm.weight\", \"encoder.layer.4.output.LayerNorm.bias\", \"encoder.layer.5.attention.self.query.weight\", \"encoder.layer.5.attention.self.query.bias\", \"encoder.layer.5.attention.self.key.weight\", \"encoder.layer.5.attention.self.key.bias\", \"encoder.layer.5.attention.self.value.weight\", \"encoder.layer.5.attention.self.value.bias\", \"encoder.layer.5.attention.output.dense.weight\", \"encoder.layer.5.attention.output.dense.bias\", \"encoder.layer.5.attention.output.LayerNorm.weight\", \"encoder.layer.5.attention.output.LayerNorm.bias\", \"encoder.layer.5.intermediate.dense.weight\", \"encoder.layer.5.intermediate.dense.bias\", \"encoder.layer.5.output.dense.weight\", \"encoder.layer.5.output.dense.bias\", \"encoder.layer.5.output.LayerNorm.weight\", \"encoder.layer.5.output.LayerNorm.bias\", \"encoder.layer.6.attention.self.query.weight\", \"encoder.layer.6.attention.self.query.bias\", \"encoder.layer.6.attention.self.key.weight\", \"encoder.layer.6.attention.self.key.bias\", \"encoder.layer.6.attention.self.value.weight\", \"encoder.layer.6.attention.self.value.bias\", \"encoder.layer.6.attention.output.dense.weight\", \"encoder.layer.6.attention.output.dense.bias\", \"encoder.layer.6.attention.output.LayerNorm.weight\", \"encoder.layer.6.attention.output.LayerNorm.bias\", \"encoder.layer.6.intermediate.dense.weight\", \"encoder.layer.6.intermediate.dense.bias\", \"encoder.layer.6.output.dense.weight\", \"encoder.layer.6.output.dense.bias\", \"encoder.layer.6.output.LayerNorm.weight\", \"encoder.layer.6.output.LayerNorm.bias\", \"encoder.layer.7.attention.self.query.weight\", \"encoder.layer.7.attention.self.query.bias\", \"encoder.layer.7.attention.self.key.weight\", \"encoder.layer.7.attention.self.key.bias\", \"encoder.layer.7.attention.self.value.weight\", \"encoder.layer.7.attention.self.value.bias\", \"encoder.layer.7.attention.output.dense.weight\", 
\"encoder.layer.7.attention.output.dense.bias\", \"encoder.layer.7.attention.output.LayerNorm.weight\", \"encoder.layer.7.attention.output.LayerNorm.bias\", \"encoder.layer.7.intermediate.dense.weight\", \"encoder.layer.7.intermediate.dense.bias\", \"encoder.layer.7.output.dense.weight\", \"encoder.layer.7.output.dense.bias\", \"encoder.layer.7.output.LayerNorm.weight\", \"encoder.layer.7.output.LayerNorm.bias\", \"encoder.layer.8.attention.self.query.weight\", \"encoder.layer.8.attention.self.query.bias\", \"encoder.layer.8.attention.self.key.weight\", \"encoder.layer.8.attention.self.key.bias\", \"encoder.layer.8.attention.self.value.weight\", \"encoder.layer.8.attention.self.value.bias\", \"encoder.layer.8.attention.output.dense.weight\", \"encoder.layer.8.attention.output.dense.bias\", \"encoder.layer.8.attention.output.LayerNorm.weight\", \"encoder.layer.8.attention.output.LayerNorm.bias\", \"encoder.layer.8.intermediate.dense.weight\", \"encoder.layer.8.intermediate.dense.bias\", \"encoder.layer.8.output.dense.weight\", \"encoder.layer.8.output.dense.bias\", \"encoder.layer.8.output.LayerNorm.weight\", \"encoder.layer.8.output.LayerNorm.bias\", \"encoder.layer.9.attention.self.query.weight\", \"encoder.layer.9.attention.self.query.bias\", \"encoder.layer.9.attention.self.key.weight\", \"encoder.layer.9.attention.self.key.bias\", \"encoder.layer.9.attention.self.value.weight\", \"encoder.layer.9.attention.self.value.bias\", \"encoder.layer.9.attention.output.dense.weight\", \"encoder.layer.9.attention.output.dense.bias\", \"encoder.layer.9.attention.output.LayerNorm.weight\", \"encoder.layer.9.attention.output.LayerNorm.bias\", \"encoder.layer.9.intermediate.dense.weight\", \"encoder.layer.9.intermediate.dense.bias\", \"encoder.layer.9.output.dense.weight\", \"encoder.layer.9.output.dense.bias\", \"encoder.layer.9.output.LayerNorm.weight\", \"encoder.layer.9.output.LayerNorm.bias\", \"encoder.layer.10.attention.self.query.weight\", 
\"encoder.layer.10.attention.self.query.bias\", \"encoder.layer.10.attention.self.key.weight\", \"encoder.layer.10.attention.self.key.bias\", \"encoder.layer.10.attention.self.value.weight\", \"encoder.layer.10.attention.self.value.bias\", \"encoder.layer.10.attention.output.dense.weight\", \"encoder.layer.10.attention.output.dense.bias\", \"encoder.layer.10.attention.output.LayerNorm.weight\", \"encoder.layer.10.attention.output.LayerNorm.bias\", \"encoder.layer.10.intermediate.dense.weight\", \"encoder.layer.10.intermediate.dense.bias\", \"encoder.layer.10.output.dense.weight\", \"encoder.layer.10.output.dense.bias\", \"encoder.layer.10.output.LayerNorm.weight\", \"encoder.layer.10.output.LayerNorm.bias\", \"encoder.layer.11.attention.self.query.weight\", \"encoder.layer.11.attention.self.query.bias\", \"encoder.layer.11.attention.self.key.weight\", \"encoder.layer.11.attention.self.key.bias\", \"encoder.layer.11.attention.self.value.weight\", \"encoder.layer.11.attention.self.value.bias\", \"encoder.layer.11.attention.output.dense.weight\", \"encoder.layer.11.attention.output.dense.bias\", \"encoder.layer.11.attention.output.LayerNorm.weight\", \"encoder.layer.11.attention.output.LayerNorm.bias\", \"encoder.layer.11.intermediate.dense.weight\", \"encoder.layer.11.intermediate.dense.bias\", \"encoder.layer.11.output.dense.weight\", \"encoder.layer.11.output.dense.bias\", \"encoder.layer.11.output.LayerNorm.weight\", \"encoder.layer.11.output.LayerNorm.bias\", \"encoder.layer.12.attention.self.query.weight\", \"encoder.layer.12.attention.self.query.bias\", \"encoder.layer.12.attention.self.key.weight\", \"encoder.layer.12.attention.self.key.bias\", \"encoder.layer.12.attention.self.value.weight\", \"encoder.layer.12.attention.self.value.bias\", \"encoder.layer.12.attention.output.dense.weight\", \"encoder.layer.12.attention.output.dense.bias\", \"encoder.layer.12.attention.output.LayerNorm.weight\", \"encoder.layer.12.attention.output.LayerNorm.bias\", 
\"encoder.layer.12.intermediate.dense.weight\", \"encoder.layer.12.intermediate.dense.bias\", \"encoder.layer.12.output.dense.weight\", \"encoder.layer.12.output.dense.bias\", \"encoder.layer.12.output.LayerNorm.weight\", \"encoder.layer.12.output.LayerNorm.bias\", \"encoder.layer.13.attention.self.query.weight\", \"encoder.layer.13.attention.self.query.bias\", \"encoder.layer.13.attention.self.key.weight\", \"encoder.layer.13.attention.self.key.bias\", \"encoder.layer.13.attention.self.value.weight\", \"encoder.layer.13.attention.self.value.bias\", \"encoder.layer.13.attention.output.dense.weight\", \"encoder.layer.13.attention.output.dense.bias\", \"encoder.layer.13.attention.output.LayerNorm.weight\", \"encoder.layer.13.attention.output.LayerNorm.bias\", \"encoder.layer.13.intermediate.dense.weight\", \"encoder.layer.13.intermediate.dense.bias\", \"encoder.layer.13.output.dense.weight\", \"encoder.layer.13.output.dense.bias\", \"encoder.layer.13.output.LayerNorm.weight\", \"encoder.layer.13.output.LayerNorm.bias\", \"encoder.layer.14.attention.self.query.weight\", \"encoder.layer.14.attention.self.query.bias\", \"encoder.layer.14.attention.self.key.weight\", \"encoder.layer.14.attention.self.key.bias\", \"encoder.layer.14.attention.self.value.weight\", \"encoder.layer.14.attention.self.value.bias\", \"encoder.layer.14.attention.output.dense.weight\", \"encoder.layer.14.attention.output.dense.bias\", \"encoder.layer.14.attention.output.LayerNorm.weight\", \"encoder.layer.14.attention.output.LayerNorm.bias\", \"encoder.layer.14.intermediate.dense.weight\", \"encoder.layer.14.intermediate.dense.bias\", \"encoder.layer.14.output.dense.weight\", \"encoder.layer.14.output.dense.bias\", \"encoder.layer.14.output.LayerNorm.weight\", \"encoder.layer.14.output.LayerNorm.bias\", \"encoder.layer.15.attention.self.query.weight\", \"encoder.layer.15.attention.self.query.bias\", \"encoder.layer.15.attention.self.key.weight\", \"encoder.layer.15.attention.self.key.bias\", 
\"encoder.layer.15.attention.self.value.weight\", \"encoder.layer.15.attention.self.value.bias\", \"encoder.layer.15.attention.output.dense.weight\", \"encoder.layer.15.attention.output.dense.bias\", \"encoder.layer.15.attention.output.LayerNorm.weight\", \"encoder.layer.15.attention.output.LayerNorm.bias\", \"encoder.layer.15.intermediate.dense.weight\", \"encoder.layer.15.intermediate.dense.bias\", \"encoder.layer.15.output.dense.weight\", \"encoder.layer.15.output.dense.bias\", \"encoder.layer.15.output.LayerNorm.weight\", \"encoder.layer.15.output.LayerNorm.bias\", \"encoder.layer.16.attention.self.query.weight\", \"encoder.layer.16.attention.self.query.bias\", \"encoder.layer.16.attention.self.key.weight\", \"encoder.layer.16.attention.self.key.bias\", \"encoder.layer.16.attention.self.value.weight\", \"encoder.layer.16.attention.self.value.bias\", \"encoder.layer.16.attention.output.dense.weight\", \"encoder.layer.16.attention.output.dense.bias\", \"encoder.layer.16.attention.output.LayerNorm.weight\", \"encoder.layer.16.attention.output.LayerNorm.bias\", \"encoder.layer.16.intermediate.dense.weight\", \"encoder.layer.16.intermediate.dense.bias\", \"encoder.layer.16.output.dense.weight\", \"encoder.layer.16.output.dense.bias\", \"encoder.layer.16.output.LayerNorm.weight\", \"encoder.layer.16.output.LayerNorm.bias\", \"encoder.layer.17.attention.self.query.weight\", \"encoder.layer.17.attention.self.query.bias\", \"encoder.layer.17.attention.self.key.weight\", \"encoder.layer.17.attention.self.key.bias\", \"encoder.layer.17.attention.self.value.weight\", \"encoder.layer.17.attention.self.value.bias\", \"encoder.layer.17.attention.output.dense.weight\", \"encoder.layer.17.attention.output.dense.bias\", \"encoder.layer.17.attention.output.LayerNorm.weight\", \"encoder.layer.17.attention.output.LayerNorm.bias\", \"encoder.layer.17.intermediate.dense.weight\", \"encoder.layer.17.intermediate.dense.bias\", \"encoder.layer.17.output.dense.weight\", 
\"encoder.layer.17.output.dense.bias\", \"encoder.layer.17.output.LayerNorm.weight\", \"encoder.layer.17.output.LayerNorm.bias\", \"encoder.layer.18.attention.self.query.weight\", \"encoder.layer.18.attention.self.query.bias\", \"encoder.layer.18.attention.self.key.weight\", \"encoder.layer.18.attention.self.key.bias\", \"encoder.layer.18.attention.self.value.weight\", \"encoder.layer.18.attention.self.value.bias\", \"encoder.layer.18.attention.output.dense.weight\", \"encoder.layer.18.attention.output.dense.bias\", \"encoder.layer.18.attention.output.LayerNorm.weight\", \"encoder.layer.18.attention.output.LayerNorm.bias\", \"encoder.layer.18.intermediate.dense.weight\", \"encoder.layer.18.intermediate.dense.bias\", \"encoder.layer.18.output.dense.weight\", \"encoder.layer.18.output.dense.bias\", \"encoder.layer.18.output.LayerNorm.weight\", \"encoder.layer.18.output.LayerNorm.bias\", \"encoder.layer.19.attention.self.query.weight\", \"encoder.layer.19.attention.self.query.bias\", \"encoder.layer.19.attention.self.key.weight\", \"encoder.layer.19.attention.self.key.bias\", \"encoder.layer.19.attention.self.value.weight\", \"encoder.layer.19.attention.self.value.bias\", \"encoder.layer.19.attention.output.dense.weight\", \"encoder.layer.19.attention.output.dense.bias\", \"encoder.layer.19.attention.output.LayerNorm.weight\", \"encoder.layer.19.attention.output.LayerNorm.bias\", \"encoder.layer.19.intermediate.dense.weight\", \"encoder.layer.19.intermediate.dense.bias\", \"encoder.layer.19.output.dense.weight\", \"encoder.layer.19.output.dense.bias\", \"encoder.layer.19.output.LayerNorm.weight\", \"encoder.layer.19.output.LayerNorm.bias\", \"encoder.layer.20.attention.self.query.weight\", \"encoder.layer.20.attention.self.query.bias\", \"encoder.layer.20.attention.self.key.weight\", \"encoder.layer.20.attention.self.key.bias\", \"encoder.layer.20.attention.self.value.weight\", \"encoder.layer.20.attention.self.value.bias\", 
\"encoder.layer.20.attention.output.dense.weight\", \"encoder.layer.20.attention.output.dense.bias\", \"encoder.layer.20.attention.output.LayerNorm.weight\", \"encoder.layer.20.attention.output.LayerNorm.bias\", \"encoder.layer.20.intermediate.dense.weight\", \"encoder.layer.20.intermediate.dense.bias\", \"encoder.layer.20.output.dense.weight\", \"encoder.layer.20.output.dense.bias\", \"encoder.layer.20.output.LayerNorm.weight\", \"encoder.layer.20.output.LayerNorm.bias\", \"encoder.layer.21.attention.self.query.weight\", \"encoder.layer.21.attention.self.query.bias\", \"encoder.layer.21.attention.self.key.weight\", \"encoder.layer.21.attention.self.key.bias\", \"encoder.layer.21.attention.self.value.weight\", \"encoder.layer.21.attention.self.value.bias\", \"encoder.layer.21.attention.output.dense.weight\", \"encoder.layer.21.attention.output.dense.bias\", \"encoder.layer.21.attention.output.LayerNorm.weight\", \"encoder.layer.21.attention.output.LayerNorm.bias\", \"encoder.layer.21.intermediate.dense.weight\", \"encoder.layer.21.intermediate.dense.bias\", \"encoder.layer.21.output.dense.weight\", \"encoder.layer.21.output.dense.bias\", \"encoder.layer.21.output.LayerNorm.weight\", \"encoder.layer.21.output.LayerNorm.bias\", \"encoder.layer.22.attention.self.query.weight\", \"encoder.layer.22.attention.self.query.bias\", \"encoder.layer.22.attention.self.key.weight\", \"encoder.layer.22.attention.self.key.bias\", \"encoder.layer.22.attention.self.value.weight\", \"encoder.layer.22.attention.self.value.bias\", \"encoder.layer.22.attention.output.dense.weight\", \"encoder.layer.22.attention.output.dense.bias\", \"encoder.layer.22.attention.output.LayerNorm.weight\", \"encoder.layer.22.attention.output.LayerNorm.bias\", \"encoder.layer.22.intermediate.dense.weight\", \"encoder.layer.22.intermediate.dense.bias\", \"encoder.layer.22.output.dense.weight\", \"encoder.layer.22.output.dense.bias\", \"encoder.layer.22.output.LayerNorm.weight\", 
\"encoder.layer.22.output.LayerNorm.bias\", \"encoder.layer.23.attention.self.query.weight\", \"encoder.layer.23.attention.self.query.bias\", \"encoder.layer.23.attention.self.key.weight\", \"encoder.layer.23.attention.self.key.bias\", \"encoder.layer.23.attention.self.value.weight\", \"encoder.layer.23.attention.self.value.bias\", \"encoder.layer.23.attention.output.dense.weight\", \"encoder.layer.23.attention.output.dense.bias\", \"encoder.layer.23.attention.output.LayerNorm.weight\", \"encoder.layer.23.attention.output.LayerNorm.bias\", \"encoder.layer.23.intermediate.dense.weight\", \"encoder.layer.23.intermediate.dense.bias\", \"encoder.layer.23.output.dense.weight\", \"encoder.layer.23.output.dense.bias\", \"encoder.layer.23.output.LayerNorm.weight\", \"encoder.layer.23.output.LayerNorm.bias\", \"pooler.dense.weight\", \"pooler.dense.bias\". \n\tUnexpected key(s) in state_dict: \"bert.embeddings.word_embeddings.weight\", \"bert.embeddings.position_embeddings.weight\", \"bert.embeddings.token_type_embeddings.weight\", \"bert.embeddings.LayerNorm.weight\", \"bert.embeddings.LayerNorm.bias\", \"bert.encoder.layer.0.attention.self.query.weight\", \"bert.encoder.layer.0.attention.self.query.bias\", \"bert.encoder.layer.0.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.0.attention.self.key.weight\", \"bert.encoder.layer.0.attention.self.key.bias\", \"bert.encoder.layer.0.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.0.attention.self.value.weight\", \"bert.encoder.layer.0.attention.self.value.bias\", \"bert.encoder.layer.0.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.0.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.0.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.0.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.0.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.0.attention.output.dense.weight\", 
\"bert.encoder.layer.0.attention.output.dense.bias\", \"bert.encoder.layer.0.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.0.attention.output.LayerNorm.weight\", \"bert.encoder.layer.0.attention.output.LayerNorm.bias\", \"bert.encoder.layer.0.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.0.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.0.intermediate.dense.weight\", \"bert.encoder.layer.0.intermediate.dense.bias\", \"bert.encoder.layer.0.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.0.output.dense.weight\", \"bert.encoder.layer.0.output.dense.bias\", \"bert.encoder.layer.0.output.dense._input_quantizer._amax\", \"bert.encoder.layer.0.output.LayerNorm.weight\", \"bert.encoder.layer.0.output.LayerNorm.bias\", \"bert.encoder.layer.0.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.0.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.1.attention.self.query.weight\", \"bert.encoder.layer.1.attention.self.query.bias\", \"bert.encoder.layer.1.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.1.attention.self.key.weight\", \"bert.encoder.layer.1.attention.self.key.bias\", \"bert.encoder.layer.1.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.1.attention.self.value.weight\", \"bert.encoder.layer.1.attention.self.value.bias\", \"bert.encoder.layer.1.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.1.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.1.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.1.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.1.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.1.attention.output.dense.weight\", \"bert.encoder.layer.1.attention.output.dense.bias\", \"bert.encoder.layer.1.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.1.attention.output.LayerNorm.weight\", 
\"bert.encoder.layer.1.attention.output.LayerNorm.bias\", \"bert.encoder.layer.1.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.1.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.1.intermediate.dense.weight\", \"bert.encoder.layer.1.intermediate.dense.bias\", \"bert.encoder.layer.1.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.1.output.dense.weight\", \"bert.encoder.layer.1.output.dense.bias\", \"bert.encoder.layer.1.output.dense._input_quantizer._amax\", \"bert.encoder.layer.1.output.LayerNorm.weight\", \"bert.encoder.layer.1.output.LayerNorm.bias\", \"bert.encoder.layer.1.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.1.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.2.attention.self.query.weight\", \"bert.encoder.layer.2.attention.self.query.bias\", \"bert.encoder.layer.2.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.2.attention.self.key.weight\", \"bert.encoder.layer.2.attention.self.key.bias\", \"bert.encoder.layer.2.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.2.attention.self.value.weight\", \"bert.encoder.layer.2.attention.self.value.bias\", \"bert.encoder.layer.2.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.2.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.2.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.2.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.2.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.2.attention.output.dense.weight\", \"bert.encoder.layer.2.attention.output.dense.bias\", \"bert.encoder.layer.2.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.2.attention.output.LayerNorm.weight\", \"bert.encoder.layer.2.attention.output.LayerNorm.bias\", \"bert.encoder.layer.2.attention.output.add_local_input_quantizer._amax\", 
\"bert.encoder.layer.2.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.2.intermediate.dense.weight\", \"bert.encoder.layer.2.intermediate.dense.bias\", \"bert.encoder.layer.2.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.2.output.dense.weight\", \"bert.encoder.layer.2.output.dense.bias\", \"bert.encoder.layer.2.output.dense._input_quantizer._amax\", \"bert.encoder.layer.2.output.LayerNorm.weight\", \"bert.encoder.layer.2.output.LayerNorm.bias\", \"bert.encoder.layer.2.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.2.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.3.attention.self.query.weight\", \"bert.encoder.layer.3.attention.self.query.bias\", \"bert.encoder.layer.3.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.3.attention.self.key.weight\", \"bert.encoder.layer.3.attention.self.key.bias\", \"bert.encoder.layer.3.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.3.attention.self.value.weight\", \"bert.encoder.layer.3.attention.self.value.bias\", \"bert.encoder.layer.3.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.3.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.3.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.3.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.3.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.3.attention.output.dense.weight\", \"bert.encoder.layer.3.attention.output.dense.bias\", \"bert.encoder.layer.3.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.3.attention.output.LayerNorm.weight\", \"bert.encoder.layer.3.attention.output.LayerNorm.bias\", \"bert.encoder.layer.3.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.3.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.3.intermediate.dense.weight\", \"bert.encoder.layer.3.intermediate.dense.bias\", 
\"bert.encoder.layer.3.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.3.output.dense.weight\", \"bert.encoder.layer.3.output.dense.bias\", \"bert.encoder.layer.3.output.dense._input_quantizer._amax\", \"bert.encoder.layer.3.output.LayerNorm.weight\", \"bert.encoder.layer.3.output.LayerNorm.bias\", \"bert.encoder.layer.3.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.3.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.4.attention.self.query.weight\", \"bert.encoder.layer.4.attention.self.query.bias\", \"bert.encoder.layer.4.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.4.attention.self.key.weight\", \"bert.encoder.layer.4.attention.self.key.bias\", \"bert.encoder.layer.4.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.4.attention.self.value.weight\", \"bert.encoder.layer.4.attention.self.value.bias\", \"bert.encoder.layer.4.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.4.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.4.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.4.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.4.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.4.attention.output.dense.weight\", \"bert.encoder.layer.4.attention.output.dense.bias\", \"bert.encoder.layer.4.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.4.attention.output.LayerNorm.weight\", \"bert.encoder.layer.4.attention.output.LayerNorm.bias\", \"bert.encoder.layer.4.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.4.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.4.intermediate.dense.weight\", \"bert.encoder.layer.4.intermediate.dense.bias\", \"bert.encoder.layer.4.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.4.output.dense.weight\", \"bert.encoder.layer.4.output.dense.bias\", 
\"bert.encoder.layer.4.output.dense._input_quantizer._amax\", \"bert.encoder.layer.4.output.LayerNorm.weight\", \"bert.encoder.layer.4.output.LayerNorm.bias\", \"bert.encoder.layer.4.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.4.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.5.attention.self.query.weight\", \"bert.encoder.layer.5.attention.self.query.bias\", \"bert.encoder.layer.5.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.5.attention.self.key.weight\", \"bert.encoder.layer.5.attention.self.key.bias\", \"bert.encoder.layer.5.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.5.attention.self.value.weight\", \"bert.encoder.layer.5.attention.self.value.bias\", \"bert.encoder.layer.5.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.5.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.5.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.5.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.5.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.5.attention.output.dense.weight\", \"bert.encoder.layer.5.attention.output.dense.bias\", \"bert.encoder.layer.5.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.5.attention.output.LayerNorm.weight\", \"bert.encoder.layer.5.attention.output.LayerNorm.bias\", \"bert.encoder.layer.5.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.5.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.5.intermediate.dense.weight\", \"bert.encoder.layer.5.intermediate.dense.bias\", \"bert.encoder.layer.5.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.5.output.dense.weight\", \"bert.encoder.layer.5.output.dense.bias\", \"bert.encoder.layer.5.output.dense._input_quantizer._amax\", \"bert.encoder.layer.5.output.LayerNorm.weight\", \"bert.encoder.layer.5.output.LayerNorm.bias\", 
\"bert.encoder.layer.5.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.5.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.6.attention.self.query.weight\", \"bert.encoder.layer.6.attention.self.query.bias\", \"bert.encoder.layer.6.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.6.attention.self.key.weight\", \"bert.encoder.layer.6.attention.self.key.bias\", \"bert.encoder.layer.6.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.6.attention.self.value.weight\", \"bert.encoder.layer.6.attention.self.value.bias\", \"bert.encoder.layer.6.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.6.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.6.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.6.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.6.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.6.attention.output.dense.weight\", \"bert.encoder.layer.6.attention.output.dense.bias\", \"bert.encoder.layer.6.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.6.attention.output.LayerNorm.weight\", \"bert.encoder.layer.6.attention.output.LayerNorm.bias\", \"bert.encoder.layer.6.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.6.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.6.intermediate.dense.weight\", \"bert.encoder.layer.6.intermediate.dense.bias\", \"bert.encoder.layer.6.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.6.output.dense.weight\", \"bert.encoder.layer.6.output.dense.bias\", \"bert.encoder.layer.6.output.dense._input_quantizer._amax\", \"bert.encoder.layer.6.output.LayerNorm.weight\", \"bert.encoder.layer.6.output.LayerNorm.bias\", \"bert.encoder.layer.6.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.6.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.7.attention.self.query.weight\", 
\"bert.encoder.layer.7.attention.self.query.bias\", \"bert.encoder.layer.7.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.7.attention.self.key.weight\", \"bert.encoder.layer.7.attention.self.key.bias\", \"bert.encoder.layer.7.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.7.attention.self.value.weight\", \"bert.encoder.layer.7.attention.self.value.bias\", \"bert.encoder.layer.7.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.7.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.7.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.7.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.7.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.7.attention.output.dense.weight\", \"bert.encoder.layer.7.attention.output.dense.bias\", \"bert.encoder.layer.7.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.7.attention.output.LayerNorm.weight\", \"bert.encoder.layer.7.attention.output.LayerNorm.bias\", \"bert.encoder.layer.7.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.7.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.7.intermediate.dense.weight\", \"bert.encoder.layer.7.intermediate.dense.bias\", \"bert.encoder.layer.7.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.7.output.dense.weight\", \"bert.encoder.layer.7.output.dense.bias\", \"bert.encoder.layer.7.output.dense._input_quantizer._amax\", \"bert.encoder.layer.7.output.LayerNorm.weight\", \"bert.encoder.layer.7.output.LayerNorm.bias\", \"bert.encoder.layer.7.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.7.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.8.attention.self.query.weight\", \"bert.encoder.layer.8.attention.self.query.bias\", \"bert.encoder.layer.8.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.8.attention.self.key.weight\", 
\"bert.encoder.layer.8.attention.self.key.bias\", \"bert.encoder.layer.8.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.8.attention.self.value.weight\", \"bert.encoder.layer.8.attention.self.value.bias\", \"bert.encoder.layer.8.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.8.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.8.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.8.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.8.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.8.attention.output.dense.weight\", \"bert.encoder.layer.8.attention.output.dense.bias\", \"bert.encoder.layer.8.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.8.attention.output.LayerNorm.weight\", \"bert.encoder.layer.8.attention.output.LayerNorm.bias\", \"bert.encoder.layer.8.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.8.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.8.intermediate.dense.weight\", \"bert.encoder.layer.8.intermediate.dense.bias\", \"bert.encoder.layer.8.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.8.output.dense.weight\", \"bert.encoder.layer.8.output.dense.bias\", \"bert.encoder.layer.8.output.dense._input_quantizer._amax\", \"bert.encoder.layer.8.output.LayerNorm.weight\", \"bert.encoder.layer.8.output.LayerNorm.bias\", \"bert.encoder.layer.8.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.8.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.9.attention.self.query.weight\", \"bert.encoder.layer.9.attention.self.query.bias\", \"bert.encoder.layer.9.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.9.attention.self.key.weight\", \"bert.encoder.layer.9.attention.self.key.bias\", \"bert.encoder.layer.9.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.9.attention.self.value.weight\", 
\"bert.encoder.layer.9.attention.self.value.bias\", \"bert.encoder.layer.9.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.9.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.9.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.9.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.9.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.9.attention.output.dense.weight\", \"bert.encoder.layer.9.attention.output.dense.bias\", \"bert.encoder.layer.9.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.9.attention.output.LayerNorm.weight\", \"bert.encoder.layer.9.attention.output.LayerNorm.bias\", \"bert.encoder.layer.9.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.9.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.9.intermediate.dense.weight\", \"bert.encoder.layer.9.intermediate.dense.bias\", \"bert.encoder.layer.9.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.9.output.dense.weight\", \"bert.encoder.layer.9.output.dense.bias\", \"bert.encoder.layer.9.output.dense._input_quantizer._amax\", \"bert.encoder.layer.9.output.LayerNorm.weight\", \"bert.encoder.layer.9.output.LayerNorm.bias\", \"bert.encoder.layer.9.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.9.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.10.attention.self.query.weight\", \"bert.encoder.layer.10.attention.self.query.bias\", \"bert.encoder.layer.10.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.10.attention.self.key.weight\", \"bert.encoder.layer.10.attention.self.key.bias\", \"bert.encoder.layer.10.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.10.attention.self.value.weight\", \"bert.encoder.layer.10.attention.self.value.bias\", \"bert.encoder.layer.10.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.10.attention.self.qv_a_input_quantizer._amax\", 
\"bert.encoder.layer.10.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.10.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.10.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.10.attention.output.dense.weight\", \"bert.encoder.layer.10.attention.output.dense.bias\", \"bert.encoder.layer.10.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.10.attention.output.LayerNorm.weight\", \"bert.encoder.layer.10.attention.output.LayerNorm.bias\", \"bert.encoder.layer.10.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.10.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.10.intermediate.dense.weight\", \"bert.encoder.layer.10.intermediate.dense.bias\", \"bert.encoder.layer.10.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.10.output.dense.weight\", \"bert.encoder.layer.10.output.dense.bias\", \"bert.encoder.layer.10.output.dense._input_quantizer._amax\", \"bert.encoder.layer.10.output.LayerNorm.weight\", \"bert.encoder.layer.10.output.LayerNorm.bias\", \"bert.encoder.layer.10.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.10.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.11.attention.self.query.weight\", \"bert.encoder.layer.11.attention.self.query.bias\", \"bert.encoder.layer.11.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.11.attention.self.key.weight\", \"bert.encoder.layer.11.attention.self.key.bias\", \"bert.encoder.layer.11.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.11.attention.self.value.weight\", \"bert.encoder.layer.11.attention.self.value.bias\", \"bert.encoder.layer.11.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.11.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.11.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.11.attention.self.av_a_input_quantizer._amax\", 
\"bert.encoder.layer.11.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.11.attention.output.dense.weight\", \"bert.encoder.layer.11.attention.output.dense.bias\", \"bert.encoder.layer.11.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.11.attention.output.LayerNorm.weight\", \"bert.encoder.layer.11.attention.output.LayerNorm.bias\", \"bert.encoder.layer.11.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.11.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.11.intermediate.dense.weight\", \"bert.encoder.layer.11.intermediate.dense.bias\", \"bert.encoder.layer.11.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.11.output.dense.weight\", \"bert.encoder.layer.11.output.dense.bias\", \"bert.encoder.layer.11.output.dense._input_quantizer._amax\", \"bert.encoder.layer.11.output.LayerNorm.weight\", \"bert.encoder.layer.11.output.LayerNorm.bias\", \"bert.encoder.layer.11.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.11.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.12.attention.self.query.weight\", \"bert.encoder.layer.12.attention.self.query.bias\", \"bert.encoder.layer.12.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.12.attention.self.key.weight\", \"bert.encoder.layer.12.attention.self.key.bias\", \"bert.encoder.layer.12.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.12.attention.self.value.weight\", \"bert.encoder.layer.12.attention.self.value.bias\", \"bert.encoder.layer.12.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.12.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.12.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.12.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.12.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.12.attention.output.dense.weight\", 
\"bert.encoder.layer.12.attention.output.dense.bias\", \"bert.encoder.layer.12.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.12.attention.output.LayerNorm.weight\", \"bert.encoder.layer.12.attention.output.LayerNorm.bias\", \"bert.encoder.layer.12.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.12.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.12.intermediate.dense.weight\", \"bert.encoder.layer.12.intermediate.dense.bias\", \"bert.encoder.layer.12.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.12.output.dense.weight\", \"bert.encoder.layer.12.output.dense.bias\", \"bert.encoder.layer.12.output.dense._input_quantizer._amax\", \"bert.encoder.layer.12.output.LayerNorm.weight\", \"bert.encoder.layer.12.output.LayerNorm.bias\", \"bert.encoder.layer.12.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.12.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.13.attention.self.query.weight\", \"bert.encoder.layer.13.attention.self.query.bias\", \"bert.encoder.layer.13.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.13.attention.self.key.weight\", \"bert.encoder.layer.13.attention.self.key.bias\", \"bert.encoder.layer.13.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.13.attention.self.value.weight\", \"bert.encoder.layer.13.attention.self.value.bias\", \"bert.encoder.layer.13.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.13.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.13.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.13.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.13.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.13.attention.output.dense.weight\", \"bert.encoder.layer.13.attention.output.dense.bias\", \"bert.encoder.layer.13.attention.output.dense._input_quantizer._amax\", 
\"bert.encoder.layer.13.attention.output.LayerNorm.weight\", \"bert.encoder.layer.13.attention.output.LayerNorm.bias\", \"bert.encoder.layer.13.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.13.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.13.intermediate.dense.weight\", \"bert.encoder.layer.13.intermediate.dense.bias\", \"bert.encoder.layer.13.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.13.output.dense.weight\", \"bert.encoder.layer.13.output.dense.bias\", \"bert.encoder.layer.13.output.dense._input_quantizer._amax\", \"bert.encoder.layer.13.output.LayerNorm.weight\", \"bert.encoder.layer.13.output.LayerNorm.bias\", \"bert.encoder.layer.13.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.13.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.14.attention.self.query.weight\", \"bert.encoder.layer.14.attention.self.query.bias\", \"bert.encoder.layer.14.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.14.attention.self.key.weight\", \"bert.encoder.layer.14.attention.self.key.bias\", \"bert.encoder.layer.14.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.14.attention.self.value.weight\", \"bert.encoder.layer.14.attention.self.value.bias\", \"bert.encoder.layer.14.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.14.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.14.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.14.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.14.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.14.attention.output.dense.weight\", \"bert.encoder.layer.14.attention.output.dense.bias\", \"bert.encoder.layer.14.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.14.attention.output.LayerNorm.weight\", \"bert.encoder.layer.14.attention.output.LayerNorm.bias\", 
\"bert.encoder.layer.14.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.14.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.14.intermediate.dense.weight\", \"bert.encoder.layer.14.intermediate.dense.bias\", \"bert.encoder.layer.14.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.14.output.dense.weight\", \"bert.encoder.layer.14.output.dense.bias\", \"bert.encoder.layer.14.output.dense._input_quantizer._amax\", \"bert.encoder.layer.14.output.LayerNorm.weight\", \"bert.encoder.layer.14.output.LayerNorm.bias\", \"bert.encoder.layer.14.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.14.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.15.attention.self.query.weight\", \"bert.encoder.layer.15.attention.self.query.bias\", \"bert.encoder.layer.15.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.15.attention.self.key.weight\", \"bert.encoder.layer.15.attention.self.key.bias\", \"bert.encoder.layer.15.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.15.attention.self.value.weight\", \"bert.encoder.layer.15.attention.self.value.bias\", \"bert.encoder.layer.15.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.15.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.15.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.15.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.15.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.15.attention.output.dense.weight\", \"bert.encoder.layer.15.attention.output.dense.bias\", \"bert.encoder.layer.15.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.15.attention.output.LayerNorm.weight\", \"bert.encoder.layer.15.attention.output.LayerNorm.bias\", \"bert.encoder.layer.15.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.15.attention.output.add_residual_input_quantizer._amax\", 
\"bert.encoder.layer.15.intermediate.dense.weight\", \"bert.encoder.layer.15.intermediate.dense.bias\", \"bert.encoder.layer.15.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.15.output.dense.weight\", \"bert.encoder.layer.15.output.dense.bias\", \"bert.encoder.layer.15.output.dense._input_quantizer._amax\", \"bert.encoder.layer.15.output.LayerNorm.weight\", \"bert.encoder.layer.15.output.LayerNorm.bias\", \"bert.encoder.layer.15.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.15.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.16.attention.self.query.weight\", \"bert.encoder.layer.16.attention.self.query.bias\", \"bert.encoder.layer.16.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.16.attention.self.key.weight\", \"bert.encoder.layer.16.attention.self.key.bias\", \"bert.encoder.layer.16.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.16.attention.self.value.weight\", \"bert.encoder.layer.16.attention.self.value.bias\", \"bert.encoder.layer.16.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.16.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.16.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.16.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.16.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.16.attention.output.dense.weight\", \"bert.encoder.layer.16.attention.output.dense.bias\", \"bert.encoder.layer.16.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.16.attention.output.LayerNorm.weight\", \"bert.encoder.layer.16.attention.output.LayerNorm.bias\", \"bert.encoder.layer.16.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.16.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.16.intermediate.dense.weight\", \"bert.encoder.layer.16.intermediate.dense.bias\", 
\"bert.encoder.layer.16.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.16.output.dense.weight\", \"bert.encoder.layer.16.output.dense.bias\", \"bert.encoder.layer.16.output.dense._input_quantizer._amax\", \"bert.encoder.layer.16.output.LayerNorm.weight\", \"bert.encoder.layer.16.output.LayerNorm.bias\", \"bert.encoder.layer.16.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.16.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.17.attention.self.query.weight\", \"bert.encoder.layer.17.attention.self.query.bias\", \"bert.encoder.layer.17.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.17.attention.self.key.weight\", \"bert.encoder.layer.17.attention.self.key.bias\", \"bert.encoder.layer.17.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.17.attention.self.value.weight\", \"bert.encoder.layer.17.attention.self.value.bias\", \"bert.encoder.layer.17.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.17.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.17.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.17.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.17.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.17.attention.output.dense.weight\", \"bert.encoder.layer.17.attention.output.dense.bias\", \"bert.encoder.layer.17.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.17.attention.output.LayerNorm.weight\", \"bert.encoder.layer.17.attention.output.LayerNorm.bias\", \"bert.encoder.layer.17.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.17.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.17.intermediate.dense.weight\", \"bert.encoder.layer.17.intermediate.dense.bias\", \"bert.encoder.layer.17.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.17.output.dense.weight\", \"bert.encoder.layer.17.output.dense.bias\", 
\"bert.encoder.layer.17.output.dense._input_quantizer._amax\", \"bert.encoder.layer.17.output.LayerNorm.weight\", \"bert.encoder.layer.17.output.LayerNorm.bias\", \"bert.encoder.layer.17.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.17.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.18.attention.self.query.weight\", \"bert.encoder.layer.18.attention.self.query.bias\", \"bert.encoder.layer.18.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.18.attention.self.key.weight\", \"bert.encoder.layer.18.attention.self.key.bias\", \"bert.encoder.layer.18.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.18.attention.self.value.weight\", \"bert.encoder.layer.18.attention.self.value.bias\", \"bert.encoder.layer.18.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.18.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.18.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.18.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.18.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.18.attention.output.dense.weight\", \"bert.encoder.layer.18.attention.output.dense.bias\", \"bert.encoder.layer.18.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.18.attention.output.LayerNorm.weight\", \"bert.encoder.layer.18.attention.output.LayerNorm.bias\", \"bert.encoder.layer.18.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.18.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.18.intermediate.dense.weight\", \"bert.encoder.layer.18.intermediate.dense.bias\", \"bert.encoder.layer.18.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.18.output.dense.weight\", \"bert.encoder.layer.18.output.dense.bias\", \"bert.encoder.layer.18.output.dense._input_quantizer._amax\", \"bert.encoder.layer.18.output.LayerNorm.weight\", \"bert.encoder.layer.18.output.LayerNorm.bias\", 
\"bert.encoder.layer.18.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.18.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.19.attention.self.query.weight\", \"bert.encoder.layer.19.attention.self.query.bias\", \"bert.encoder.layer.19.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.19.attention.self.key.weight\", \"bert.encoder.layer.19.attention.self.key.bias\", \"bert.encoder.layer.19.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.19.attention.self.value.weight\", \"bert.encoder.layer.19.attention.self.value.bias\", \"bert.encoder.layer.19.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.19.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.19.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.19.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.19.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.19.attention.output.dense.weight\", \"bert.encoder.layer.19.attention.output.dense.bias\", \"bert.encoder.layer.19.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.19.attention.output.LayerNorm.weight\", \"bert.encoder.layer.19.attention.output.LayerNorm.bias\", \"bert.encoder.layer.19.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.19.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.19.intermediate.dense.weight\", \"bert.encoder.layer.19.intermediate.dense.bias\", \"bert.encoder.layer.19.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.19.output.dense.weight\", \"bert.encoder.layer.19.output.dense.bias\", \"bert.encoder.layer.19.output.dense._input_quantizer._amax\", \"bert.encoder.layer.19.output.LayerNorm.weight\", \"bert.encoder.layer.19.output.LayerNorm.bias\", \"bert.encoder.layer.19.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.19.output.add_residual_input_quantizer._amax\", 
\"bert.encoder.layer.20.attention.self.query.weight\", \"bert.encoder.layer.20.attention.self.query.bias\", \"bert.encoder.layer.20.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.20.attention.self.key.weight\", \"bert.encoder.layer.20.attention.self.key.bias\", \"bert.encoder.layer.20.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.20.attention.self.value.weight\", \"bert.encoder.layer.20.attention.self.value.bias\", \"bert.encoder.layer.20.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.20.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.20.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.20.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.20.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.20.attention.output.dense.weight\", \"bert.encoder.layer.20.attention.output.dense.bias\", \"bert.encoder.layer.20.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.20.attention.output.LayerNorm.weight\", \"bert.encoder.layer.20.attention.output.LayerNorm.bias\", \"bert.encoder.layer.20.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.20.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.20.intermediate.dense.weight\", \"bert.encoder.layer.20.intermediate.dense.bias\", \"bert.encoder.layer.20.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.20.output.dense.weight\", \"bert.encoder.layer.20.output.dense.bias\", \"bert.encoder.layer.20.output.dense._input_quantizer._amax\", \"bert.encoder.layer.20.output.LayerNorm.weight\", \"bert.encoder.layer.20.output.LayerNorm.bias\", \"bert.encoder.layer.20.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.20.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.21.attention.self.query.weight\", \"bert.encoder.layer.21.attention.self.query.bias\", 
\"bert.encoder.layer.21.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.21.attention.self.key.weight\", \"bert.encoder.layer.21.attention.self.key.bias\", \"bert.encoder.layer.21.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.21.attention.self.value.weight\", \"bert.encoder.layer.21.attention.self.value.bias\", \"bert.encoder.layer.21.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.21.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.21.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.21.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.21.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.21.attention.output.dense.weight\", \"bert.encoder.layer.21.attention.output.dense.bias\", \"bert.encoder.layer.21.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.21.attention.output.LayerNorm.weight\", \"bert.encoder.layer.21.attention.output.LayerNorm.bias\", \"bert.encoder.layer.21.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.21.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.21.intermediate.dense.weight\", \"bert.encoder.layer.21.intermediate.dense.bias\", \"bert.encoder.layer.21.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.21.output.dense.weight\", \"bert.encoder.layer.21.output.dense.bias\", \"bert.encoder.layer.21.output.dense._input_quantizer._amax\", \"bert.encoder.layer.21.output.LayerNorm.weight\", \"bert.encoder.layer.21.output.LayerNorm.bias\", \"bert.encoder.layer.21.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.21.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.22.attention.self.query.weight\", \"bert.encoder.layer.22.attention.self.query.bias\", \"bert.encoder.layer.22.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.22.attention.self.key.weight\", 
\"bert.encoder.layer.22.attention.self.key.bias\", \"bert.encoder.layer.22.attention.self.key._input_quantizer._amax\", \"bert.encoder.layer.22.attention.self.value.weight\", \"bert.encoder.layer.22.attention.self.value.bias\", \"bert.encoder.layer.22.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.22.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.22.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.22.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.22.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.22.attention.output.dense.weight\", \"bert.encoder.layer.22.attention.output.dense.bias\", \"bert.encoder.layer.22.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.22.attention.output.LayerNorm.weight\", \"bert.encoder.layer.22.attention.output.LayerNorm.bias\", \"bert.encoder.layer.22.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.22.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.22.intermediate.dense.weight\", \"bert.encoder.layer.22.intermediate.dense.bias\", \"bert.encoder.layer.22.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.22.output.dense.weight\", \"bert.encoder.layer.22.output.dense.bias\", \"bert.encoder.layer.22.output.dense._input_quantizer._amax\", \"bert.encoder.layer.22.output.LayerNorm.weight\", \"bert.encoder.layer.22.output.LayerNorm.bias\", \"bert.encoder.layer.22.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.22.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.23.attention.self.query.weight\", \"bert.encoder.layer.23.attention.self.query.bias\", \"bert.encoder.layer.23.attention.self.query._input_quantizer._amax\", \"bert.encoder.layer.23.attention.self.key.weight\", \"bert.encoder.layer.23.attention.self.key.bias\", \"bert.encoder.layer.23.attention.self.key._input_quantizer._amax\", 
\"bert.encoder.layer.23.attention.self.value.weight\", \"bert.encoder.layer.23.attention.self.value.bias\", \"bert.encoder.layer.23.attention.self.value._input_quantizer._amax\", \"bert.encoder.layer.23.attention.self.qv_a_input_quantizer._amax\", \"bert.encoder.layer.23.attention.self.qv_b_input_quantizer._amax\", \"bert.encoder.layer.23.attention.self.av_a_input_quantizer._amax\", \"bert.encoder.layer.23.attention.self.av_b_input_quantizer._amax\", \"bert.encoder.layer.23.attention.output.dense.weight\", \"bert.encoder.layer.23.attention.output.dense.bias\", \"bert.encoder.layer.23.attention.output.dense._input_quantizer._amax\", \"bert.encoder.layer.23.attention.output.LayerNorm.weight\", \"bert.encoder.layer.23.attention.output.LayerNorm.bias\", \"bert.encoder.layer.23.attention.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.23.attention.output.add_residual_input_quantizer._amax\", \"bert.encoder.layer.23.intermediate.dense.weight\", \"bert.encoder.layer.23.intermediate.dense.bias\", \"bert.encoder.layer.23.intermediate.dense._input_quantizer._amax\", \"bert.encoder.layer.23.output.dense.weight\", \"bert.encoder.layer.23.output.dense.bias\", \"bert.encoder.layer.23.output.dense._input_quantizer._amax\", \"bert.encoder.layer.23.output.LayerNorm.weight\", \"bert.encoder.layer.23.output.LayerNorm.bias\", \"bert.encoder.layer.23.output.add_local_input_quantizer._amax\", \"bert.encoder.layer.23.output.add_residual_input_quantizer._amax\", \"bert.encoder.final_input_quantizer._amax\", \"bert.pooler.dense.weight\", \"bert.pooler.dense.bias\", \"qa_outputs.weight\", \"qa_outputs.bias\". "
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "# Checkpoint location: set the BERT_MODEL_PATH environment variable to point at\n",
    "# your copy of pytorch_model.bin; the fallback is the original local download path.\n",
    "MODEL_PATH = os.environ.get('BERT_MODEL_PATH', '/Users/oliver/Downloads/pytorch_model.bin')\n",
    "\n",
    "# NOTE: torch.load unpickles the file, which can execute arbitrary code.\n",
    "# Only load checkpoints obtained from a source you trust.\n",
    "# map_location forces every tensor onto the CPU regardless of where it was saved.\n",
    "state_dict = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n",
    "\n",
    "# Last expression of the cell, so the load_state_dict result is still displayed.\n",
    "model.load_state_dict(state_dict)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "trans-fat",
   "language": "python",
   "name": "trans-fat"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
